# **Mount Google Drive**

In [1]:
# Mount Google Drive into the Colab runtime so the project files are
# reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# List the project folder to confirm the survey file is present on the mounted drive.
!ls '/content/drive/My Drive/Data Science/Project 0001'

Cleaned-Survey-Pizza.csv      Pizza-Survey-Preprocessing.ipynb	survey.txt
Pizza-Machine-Learning.ipynb  Pizza-Survey-Viz.ipynb


# **Import Libraries**

In [3]:
import pandas as pd
import numpy as np

# **Load Dataset**

In [4]:
# Read the raw survey export with pandas' default comma-separated parsing.
survey_path = '/content/drive/My Drive/Data Science/Project 0001/survey.txt'
raw_data = pd.read_csv(survey_path)
raw_data

Unnamed: 0,T: ham,pineapple,mushroom,pepperoni,chicken,extra cheese,BBQ sauce,good pizza;
0,A: 0,0,0,0,0,0,1,1;
1,A: 1,0,0,1,0,0,1,1;
2,A: 1,0,0,1,0,0,1,1;
3,A: 0,0,1,0,1,0,0,0;
4,A: 1,0,1,0,1,0,1,0;
...,...,...,...,...,...,...,...,...
115,B: 1,1,0,0,1,1,1,1;
116,B: 1,1,0,1,0,0,0,0;
117,B: 1,1,0,0,0,1,1,1;
118,B: 0,0,0,1,1,1,0,0;


# **Dataset Preprocessing**

In [5]:
# Work on an independent copy so raw_data stays untouched by the cleaning steps.
df = raw_data.copy(deep=True)
df.head()

Unnamed: 0,T: ham,pineapple,mushroom,pepperoni,chicken,extra cheese,BBQ sauce,good pizza;
0,A: 0,0,0,0,0,0,1,1;
1,A: 1,0,0,1,0,0,1,1;
2,A: 1,0,0,1,0,0,1,1;
3,A: 0,0,1,0,1,0,0,0;
4,A: 1,0,1,0,1,0,1,0;


In [6]:
df.shape  # (rows, columns) of the working copy

(120, 8)

In [7]:
df.info()  # Per-column dtype and non-null counts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   T: ham         120 non-null    object
 1    pineapple     120 non-null    int64 
 2    mushroom      120 non-null    int64 
 3    pepperoni     120 non-null    int64 
 4    chicken       120 non-null    int64 
 5    extra cheese  120 non-null    int64 
 6    BBQ sauce     120 non-null    int64 
 7    good pizza;   120 non-null    object
dtypes: int64(6), object(2)
memory usage: 7.6+ KB


In [8]:
# The rows shown above end with ';', which sticks to the last column and
# leaves it as text. Remove the ';' and store the column as integers.
# Assigning by label replaces the column, so the new integer dtype is kept;
# astype(str) keeps the cell safe to re-run once the column is numeric.
rating_col = df.columns[-1]     # ' good pizza;' at this point
df[rating_col] = df[rating_col].astype(str).str.strip(';').astype(int)
df.head()

Unnamed: 0,T: ham,pineapple,mushroom,pepperoni,chicken,extra cheese,BBQ sauce,good pizza;
0,A: 0,0,0,0,0,0,1,1
1,A: 1,0,0,1,0,0,1,1
2,A: 1,0,0,1,0,0,1,1
3,A: 0,0,1,0,1,0,0,0
4,A: 1,0,1,0,1,0,1,0


In [9]:
t_ham = df.iloc[:, 0]  # Extract the first column ("T: ham"), whose values look like "A: 0"
t_ham.head()

0    A: 0
1    A: 1
2    A: 1
3    A: 0
4    A: 1
Name: T: ham, dtype: object

In [10]:
# Break each "<label>: <digit>" string into two columns at the ': ' separator.
t_ham_split = t_ham.str.split(': ', expand=True)
t_ham_split.head()

Unnamed: 0,0,1
0,A,0
1,A,1
2,A,1
3,A,0
4,A,1


In [11]:
t_ham_split.shape  # Expect two columns: the label and the ham answer

(120, 2)

In [12]:
# Name the two split columns, then cast 'Ham' from text ('0'/'1') to int so it
# has the same numeric dtype as the other topping columns.
t_ham_split.columns = ['Gender', 'Ham']
t_ham_split['Ham'] = t_ham_split['Ham'].astype(int)
t_ham_split

Unnamed: 0,Gender,Ham
0,A,0
1,A,1
2,A,1
3,A,0
4,A,1
...,...,...
115,B,1
116,B,1
117,B,1
118,B,0


In [13]:
# Put the two new columns to the left of the existing survey columns
# (column-wise join aligned on the row index).
df = pd.concat([t_ham_split, df], axis='columns')
df.head()

Unnamed: 0,Gender,Ham,T: ham,pineapple,mushroom,pepperoni,chicken,extra cheese,BBQ sauce,good pizza;
0,A,0,A: 0,0,0,0,0,0,1,1
1,A,1,A: 1,0,0,1,0,0,1,1
2,A,1,A: 1,0,0,1,0,0,1,1
3,A,0,A: 0,0,1,0,1,0,0,0
4,A,1,A: 1,0,1,0,1,0,1,0


In [14]:
# The combined 'T: ham' column is redundant now that it has been split; remove it.
df = df.drop(columns=['T: ham'])
df

Unnamed: 0,Gender,Ham,pineapple,mushroom,pepperoni,chicken,extra cheese,BBQ sauce,good pizza;
0,A,0,0,0,0,0,0,1,1
1,A,1,0,0,1,0,0,1,1
2,A,1,0,0,1,0,0,1,1
3,A,0,0,1,0,1,0,0,0
4,A,1,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...
115,B,1,1,0,0,1,1,1,1
116,B,1,1,0,1,0,0,0,0
117,B,1,1,0,0,0,1,1,1
118,B,0,0,0,1,1,1,0,0


In [15]:
df.columns.values  # Inspect the raw header names (note the leading spaces and the trailing ';')

array(['Gender', 'Ham', ' pineapple', ' mushroom', ' pepperoni',
       ' chicken', ' extra cheese', ' BBQ sauce', ' good pizza;'],
      dtype=object)

In [16]:
# Clean headers, listed in the same order as the current DataFrame columns.
column_names = [
    'Gender', 'Ham',
    'Pineapple', 'Mushroom', 'Pepperoni', 'Chicken',
    'Extra_Cheese', 'BBQ_Sauce',
    'Pizza_Rating',
]

In [17]:
df.columns = column_names  # Replace the headers positionally with the cleaned names

In [18]:
df.head()  # Confirm the renamed headers

Unnamed: 0,Gender,Ham,Pineapple,Mushroom,Pepperoni,Chicken,Extra_Cheese,BBQ_Sauce,Pizza_Rating
0,A,0,0,0,0,0,0,1,1
1,A,1,0,0,1,0,0,1,1
2,A,1,0,0,1,0,0,1,1
3,A,0,0,1,0,1,0,0,0
4,A,1,0,1,0,1,0,1,0


In [19]:
# Encode the group label as an integer: 'A' -> 1, 'B' -> 0.
# Series.map turns any value missing from the dict into NaN (e.g. if this cell
# is run a second time on already-encoded values), so check the result before
# overwriting the column.
gender_codes = df['Gender'].map({'A': 1, 'B': 0})
assert gender_codes.notna().all(), "unexpected 'Gender' value: expected only 'A' or 'B'"
df['Gender'] = gender_codes
df['Gender'].head()

0    1
1    1
2    1
3    1
4    1
Name: Gender, dtype: int64

In [20]:
# Snapshot the cleaned DataFrame under its own name for export.
df_final = df.copy()
df_final.head()

Unnamed: 0,Gender,Ham,Pineapple,Mushroom,Pepperoni,Chicken,Extra_Cheese,BBQ_Sauce,Pizza_Rating
0,1,0,0,0,0,0,0,1,1
1,1,1,0,0,1,0,0,1,1
2,1,1,0,0,1,0,0,1,1
3,1,0,0,1,0,1,0,0,0
4,1,1,0,1,0,1,0,1,0


In [21]:
# Save the cleaned survey as a comma-separated file without the row index.
df_final.to_csv(
    '/content/drive/My Drive/Data Science/Project 0001/Cleaned-Survey-Pizza.csv',
    index=False,
    sep=',',
)