## Preparing the Speed Dating Dataset
As an entrepreneur, you are planning to launch a new dating app. The key feature that will differentiate your app from its competitors is a high-performing user-matching algorithm. Before building this model, you have partnered with a speed dating company to collect data from real events. You have just received the dataset from your partner company, but you realize it is not as clean as you expected: it contains missing and incorrect values. Your task is to fix the main data quality issues in this dataset.

In [1]:
import pandas as pd

In [2]:
# Location of the raw speed-dating CSV: repository root followed by the path inside the repo.
# Adjacent string literals inside the parentheses are joined into one URL.
file_url = (
    'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop'
    '/master/Chapter11/dataset/Speed_Dating_Data.csv'
)

In [3]:
# Read the CSV at file_url into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_url)
# Print (rows, columns), then preview the first five records
print(df.shape)
df.head(n=5)

(8378, 195)


Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,1,10,7,,4,...,5.0,7.0,7.0,7.0,7.0,,,,,
1,1,1.0,0,1,1,1,10,7,,3,...,5.0,7.0,7.0,7.0,7.0,,,,,
2,1,1.0,0,1,1,1,10,7,,10,...,5.0,7.0,7.0,7.0,7.0,,,,,
3,1,1.0,0,1,1,1,10,7,,5,...,5.0,7.0,7.0,7.0,7.0,,,,,
4,1,1.0,0,1,1,1,10,7,,7,...,5.0,7.0,7.0,7.0,7.0,,,,,


In [4]:
# count the rows that are exact copies of an earlier row (every column is compared)
is_duplicate_row = df.duplicated()
is_duplicate_row.sum()

0

In [5]:
# Rows that repeat an earlier combination of the identifier columns (iid, id, partner, pid).
# `subset` restricts the comparison to those columns; without it, duplicated()
# compares every column and only flags rows that are identical across the whole frame.
id_cols = ['iid', 'id', 'partner', 'pid']
df.loc[df.duplicated(subset=id_cols), id_cols]

Unnamed: 0,iid,id,partner,pid


In [6]:
# list the distinct imprace values to spot any entry outside the rating scale
pd.unique(df['imprace'])

array([ 2.,  8.,  1.,  4.,  7.,  3.,  9., 10., nan,  5.,  6.,  0.])

In [7]:
# The author's data key states that imprace must fall between 1 and 10,
# so pull every observation recorded with the invalid value 0.
df[df['imprace'].eq(0)]

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
6452,419,4.0,0,7,1,16,8,4,4.0,4,...,,,,,,,,,,
6453,419,4.0,0,7,1,16,8,4,4.0,5,...,,,,,,,,,,
6454,419,4.0,0,7,1,16,8,4,4.0,8,...,,,,,,,,,,
6455,419,4.0,0,7,1,16,8,4,4.0,3,...,,,,,,,,,,
6456,419,4.0,0,7,1,16,8,4,4.0,6,...,,,,,,,,,,
6457,419,4.0,0,7,1,16,8,4,4.0,1,...,,,,,,,,,,
6458,419,4.0,0,7,1,16,8,4,4.0,7,...,,,,,,,,,,
6459,419,4.0,0,7,1,16,8,4,4.0,2,...,,,,,,,,,,


In [8]:
# 8 observations carry imprace == 0, which is off the scale.
# Map them to 1, the closest value that is on the scale, then re-inspect the distinct values.
df['imprace'] = df['imprace'].replace({0: 1})
pd.unique(df['imprace'])

array([ 2.,  8.,  1.,  4.,  7.,  3.,  9., 10., nan,  5.,  6.])

In [9]:
# list the distinct imprelig values to spot any entry outside the rating scale
pd.unique(df['imprelig'])

array([ 4.,  5.,  1.,  3.,  2.,  8., 10.,  6., nan,  7.,  9.])

In [10]:
# list the distinct sports values to spot any entry outside the rating scale
pd.unique(df['sports'])

array([ 9.,  3.,  1.,  7., 10.,  5.,  2.,  4.,  8., nan,  6.])

In [11]:
# list the distinct tvsports values to spot any entry outside the rating scale
pd.unique(df['tvsports'])


array([ 2.,  8.,  1.,  4.,  3.,  9.,  7.,  6.,  5., 10., nan])

In [12]:
# list the distinct exercise values to spot any entry outside the rating scale
pd.unique(df['exercise'])


array([ 8.,  7.,  6.,  9.,  4.,  1.,  2.,  5., 10.,  3., nan])

In [13]:
# list the distinct dining values to spot any entry outside the rating scale
pd.unique(df['dining'])

array([ 9., 10.,  8.,  7.,  6.,  5., nan,  3.,  4.,  1.,  2.])

In [14]:
# list the distinct museums values to spot any entry outside the rating scale
pd.unique(df['museums'])

array([ 1.,  8.,  5.,  6., 10.,  9.,  7.,  3.,  4., nan,  2.,  0.])

In [15]:
# museums is only valid on 1-10: move the stray 0 to 1
df['museums'] = df['museums'].replace({0: 1})
# re-inspect the distinct values to confirm the 0 is gone
pd.unique(df['museums'])

array([ 1.,  8.,  5.,  6., 10.,  9.,  7.,  3.,  4., nan,  2.])

In [16]:
# list the distinct art values to spot any entry outside the rating scale
pd.unique(df['art'])


array([ 1.,  6.,  5.,  7.,  8., 10.,  9.,  3.,  4., nan,  2.,  0.])

In [17]:
# art is only valid on 1-10: move the stray 0 to 1
df['art'] = df['art'].replace({0: 1})
# re-inspect the distinct values to confirm the 0 is gone
pd.unique(df['art'])

array([ 1.,  6.,  5.,  7.,  8., 10.,  9.,  3.,  4., nan,  2.])

In [18]:
# list the distinct hiking values to spot any entry outside the rating scale
pd.unique(df['hiking'])

array([ 5.,  3.,  8.,  7.,  6.,  9.,  2.,  4., 10.,  1., nan,  0.])

In [19]:
# hiking is only valid on 1-10: move the stray 0 to 1
df['hiking'] = df['hiking'].replace({0: 1})
# re-inspect the distinct values to confirm the 0 is gone
pd.unique(df['hiking'])

array([ 5.,  3.,  8.,  7.,  6.,  9.,  2.,  4., 10.,  1., nan])

In [20]:
# list the distinct gaming values to spot any entry outside the rating scale
pd.unique(df['gaming'])

array([ 1.,  5.,  4.,  6.,  2.,  3.,  7.,  8., 10., nan,  9., 14.,  0.])

In [21]:
# gaming is only valid on 1-10: pull each out-of-range value back to the
# corresponding end of the scale (0 -> 1, 14 -> 10) in a single mapping
df['gaming'] = df['gaming'].replace({0: 1, 14: 10})
# re-inspect the distinct values to confirm 0 and 14 are gone
pd.unique(df['gaming'])

array([ 1.,  5.,  4.,  6.,  2.,  3.,  7.,  8., 10., nan,  9.])

In [22]:
# list the distinct clubbing values to spot any entry outside the rating scale
pd.unique(df['clubbing'])

array([ 5.,  8.,  7.,  6., 10.,  4.,  9.,  2.,  3., nan,  1.,  0.])

In [23]:
# clubbing is only valid on 1-10: move the stray 0 to 1
df['clubbing'] = df['clubbing'].replace({0: 1})
# re-inspect the distinct values to confirm the 0 is gone
pd.unique(df['clubbing'])

array([ 5.,  8.,  7.,  6., 10.,  4.,  9.,  2.,  3., nan,  1.])

In [24]:
# list the distinct reading values to spot any entry outside the rating scale
pd.unique(df['reading'])

array([ 6., 10.,  7.,  9.,  8.,  4.,  5., nan,  2.,  3.,  1., 13.])

In [25]:
# reading is only valid on 1-10: pull the stray 13 back to 10
df['reading'] = df['reading'].replace({13: 10})
# re-inspect the distinct values to confirm the 13 is gone
pd.unique(df['reading'])

array([ 6., 10.,  7.,  9.,  8.,  4.,  5., nan,  2.,  3.,  1.])

In [26]:
# list the distinct tv values to spot any entry outside the rating scale
pd.unique(df['tv'])

array([ 9.,  1.,  8.,  7.,  2., 10.,  5.,  6.,  3., nan,  4.])

In [27]:
# list the distinct theater values to spot any entry outside the rating scale
pd.unique(df['theater'])

array([ 1.,  9.,  7.,  6.,  5., 10.,  4.,  3.,  8., nan,  2.,  0.])

In [28]:
# theater is only valid on 1-10: move the stray 0 to 1
df['theater'] = df['theater'].replace({0: 1})
# re-inspect the distinct values to confirm the 0 is gone
pd.unique(df['theater'])

array([ 1.,  9.,  7.,  6.,  5., 10.,  4.,  3.,  8., nan,  2.])

In [29]:
# list the distinct movies values to spot any entry outside the rating scale
pd.unique(df['movies'])

array([10.,  8.,  7.,  6.,  9.,  5., nan,  4.,  3.,  2.,  0.])

In [30]:
# movies is only valid on 1-10: move the stray 0 to 1
df['movies'] = df['movies'].replace({0: 1})
# re-inspect the distinct values to confirm the 0 is gone
pd.unique(df['movies'])

array([10.,  8.,  7.,  6.,  9.,  5., nan,  4.,  3.,  2.,  1.])

In [31]:
# list the distinct concerts values to spot any entry outside the rating scale
pd.unique(df['concerts'])

array([10.,  7.,  8.,  3.,  6.,  9.,  4.,  5., nan,  2.,  1.,  0.])

In [32]:
# concerts is only valid on 1-10: move the stray 0 to 1
df['concerts'] = df['concerts'].replace({0: 1})
# re-inspect the distinct values to confirm the 0 is gone
pd.unique(df['concerts'])

array([10.,  7.,  8.,  3.,  6.,  9.,  4.,  5., nan,  2.,  1.])

In [33]:
# list the distinct music values to spot any entry outside the rating scale
pd.unique(df['music'])

array([ 9.,  8.,  5.,  7.,  4., 10.,  6., nan,  1.,  2.,  3.])

In [34]:
# list the distinct shopping values to spot any entry outside the rating scale
pd.unique(df['shopping'])

array([ 8.,  3.,  1., 10.,  7.,  5.,  6.,  2., nan,  9.,  4.])

In [35]:
# list the distinct yoga values to spot any entry outside the rating scale
pd.unique(df['yoga'])

array([ 1.,  7.,  8.,  3., 10.,  6.,  5.,  2.,  4., nan,  9.,  0.])

In [36]:
# yoga is only valid on 1-10: move the stray 0 to 1
df['yoga'] = df['yoga'].replace({0: 1})
# re-inspect the distinct values to confirm the 0 is gone
pd.unique(df['yoga'])

array([ 1.,  7.,  8.,  3., 10.,  6.,  5.,  2.,  4., nan,  9.])

### Check the data type of the different columns using .dtypes.

In [38]:
# Show the data type currently assigned to each column of df
df.dtypes

iid           int64
id          float64
gender        int64
idg           int64
condtn        int64
             ...   
attr5_3     float64
sinc5_3     float64
intel5_3    float64
fun5_3      float64
amb5_3      float64
Length: 195, dtype: object

### Change the data types to categorical for the columns that don't contain numerical values using .astype()


In [39]:
# Columns that stay numeric; every other column is treated as categorical
num_cols = [
    'round', 'order', 'int_corr',
    'age', 'mn_sat', 'income', 'expnum',
]

In [40]:
# Category column names: every column of df that is not listed in num_cols.
# NOTE(review): this also sweeps in identifier and rating columns (e.g. iid and
# attr5_3 show up as category in the dtypes output) - confirm that is intended.
cat_cols = df.columns.difference(num_cols)

In [41]:
# Cast every column named in cat_cols to the pandas 'category' dtype in one call;
# columns not named in the mapping keep their current dtype.
df = df.astype({cat_col: 'category' for cat_col in cat_cols})

In [44]:
# Display the data type of each column of df again to verify the conversion
df.dtypes

iid         category
id          category
gender      category
idg         category
condtn      category
              ...   
attr5_3     category
sinc5_3     category
intel5_3    category
fun5_3      category
amb5_3      category
Length: 195, dtype: object

In [46]:
# number of NaN entries in each numerical column
df.loc[:, num_cols].isna().sum()

round          0
order          0
int_corr     158
age           95
mn_sat      5245
income      4099
expnum      6578
dtype: int64

In [47]:
# inspect the distinct int_corr values before deciding how to impute the missing ones
pd.unique(df['int_corr'])

array([ 0.14,  0.54,  0.16,  0.61,  0.21,  0.25,  0.34,  0.5 ,  0.28,
       -0.36,  0.29,  0.18,  0.1 , -0.21,  0.32,  0.73,  0.6 ,  0.07,
        0.11,  0.39, -0.24, -0.14,  0.09, -0.04, -0.3 , -0.26, -0.15,
       -0.47, -0.18,  0.05,  0.37,  0.35,  0.15, -0.19, -0.43,  0.  ,
       -0.17,  0.08, -0.16,  0.06, -0.05, -0.13, -0.06,  0.33, -0.51,
        0.12,  0.19,  0.47,  0.03,  0.46,  0.43,  0.52, -0.46, -0.27,
        0.59,  0.31, -0.34, -0.03, -0.11,  0.42, -0.4 , -0.23,  0.17,
        0.68, -0.01, -0.35,  0.3 ,  0.65,  0.24,  0.41,  0.49,  0.01,
        0.22, -0.08,  0.27,  0.44,  0.62, -0.2 , -0.02, -0.33, -0.52,
       -0.1 ,  0.58, -0.57, -0.31, -0.07, -0.32,  0.04, -0.12,  0.48,
       -0.22, -0.29,  0.38,  0.53, -0.38,  0.02, -0.28,  0.13,  0.2 ,
         nan, -0.41, -0.44,  0.51, -0.48,  0.4 ,  0.26,  0.77, -0.49,
       -0.25, -0.09,  0.45, -0.39,  0.83,  0.57, -0.61,  0.72, -0.37,
        0.23, -0.58,  0.8 , -0.56,  0.63, -0.63,  0.71,  0.36,  0.56,
        0.55,  0.76,

In [51]:
# int_corr values lie between -1 and 1, so they appear to be already normalized.
# Impute the missing entries with the column mean.
int_corr_mean = df['int_corr'].mean()
df['int_corr'] = df['int_corr'].fillna(int_corr_mean)

In [53]:
# re-count the NaN entries in each numerical column
df.loc[:, num_cols].isna().sum()

round          0
order          0
int_corr       0
age           95
mn_sat      5245
income      4099
expnum      6578
dtype: int64

In [55]:
# numerical columns that still contain missing values
missing_num_cols = [
    'age',
    'mn_sat',
    'income',
    'expnum',
]

In [56]:
# For each column in missing_num_cols, print its name followed (on the next line)
# by the array of its distinct values
for col_name in missing_num_cols:
    distinct_values = df[col_name].unique()
    print(col_name, distinct_values, sep='\n')

age
[21. 24. 25. 23. 22. 26. 27. 30. 28. nan 29. 34. 35. 32. 39. 20. 19. 18.
 37. 33. 36. 31. 42. 38. 55.]
mn_sat
[  nan 1070. 1258. 1400. 1290. 1460. 1430. 1215. 1330. 1450. 1155. 1140.
 1360. 1402. 1250. 1210. 1220. 1410. 1260. 1380. 1030. 1309. 1308. 1050.
 1100. 1310. 1490. 1188. 1097. 1212. 1340. 1034. 1185. 1242. 1160. 1099.
 1214. 1270. 1110. 1178. 1060. 1157. 1180. 1014. 1341.  990. 1320. 1159.
 1370. 1105. 1365. 1011. 1130. 1206. 1331. 1191.  914. 1200. 1080. 1090.
 1092. 1470. 1149. 1134. 1230. 1267. 1280. 1227. 1239.]
income
[ 69487.  65929.     nan  37754.  86340.  60304.  54620.  48652.  29237.
  56580.  36782.  38548.  52010.  28418.  43185.  23152.  43664.  48441.
  61152.  36485.  41507.  17134.  30038.  33772.  24997.  42096.  28891.
  62635.  12063.  29809.  26482.  30147.  39919.  41466.  23988.  28989.
  50948.  38022.  47559.  53539.  32159.  53940.  40753.  38207.  46166.
  30973.  28317.  26645.  25589.  55223. 109031.  40409.  21597.  76624.
  35968.  51725.  55

In [62]:
# These columns are on their raw scales and some contain outliers (e.g. an age of 55,
# an income of 109031), so fill the missing values with the median rather than the mean.
for col_name in missing_num_cols:
    col_median = df[col_name].median()
    # Assign the filled column back instead of calling fillna(..., inplace=True) on
    # df[col_name]: the in-place call acts on an intermediate selection, which pandas
    # warns about and which leaves df unchanged when Copy-on-Write is enabled.
    df[col_name] = df[col_name].fillna(col_median)
    print('{}: {}'.format(col_name, col_median))

age: 26.0
mn_sat: 1310.0
income: 43185.0
expnum: 4.0


In [63]:
# Print each column in missing_num_cols with the number of missing values it still has
remaining_missing = df[missing_num_cols].isna().sum()
for col_name, n_missing in remaining_missing.items():
    print('{}: {}'.format(col_name, n_missing))

age: 0
mn_sat: 0
income: 0
expnum: 0
