In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import LabelEncoder, StandardScaler, \
    RobustScaler, Normalizer, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

# 1. Loading Data

In [2]:
# using adult data from UCI ML repository - https://archive.ics.uci.edu/ml/datasets/adult
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", header=None)

In [3]:
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
               'marital-status', 'occupation', 'relationship', 'race', 'sex',
               'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
               'salary']

In [4]:
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
salary            object
dtype: object

# 2. Descriptive Statistics

In [6]:
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [7]:
# skew
df.fnlwgt.skew()

1.4469800945789826

In [8]:
# kurtosis
df.fnlwgt.kurt()

6.218810978153801

In [9]:
# correlation for numeric columns
df[["age", "fnlwgt", "education-num", "capital-gain"]].corr()

Unnamed: 0,age,fnlwgt,education-num,capital-gain
age,1.0,-0.076646,0.036527,0.077674
fnlwgt,-0.076646,1.0,-0.043195,0.000432
education-num,0.036527,-0.043195,1.0,0.12263
capital-gain,0.077674,0.000432,0.12263,1.0


In [10]:
# value counts for catogorical variables
df.sex.value_counts()

 Male      21790
 Female    10771
Name: sex, dtype: int64

In [11]:
# unique values for catogorical variables
df.sex.unique()

array([' Male', ' Female'], dtype=object)

### How groupby works
  
Divide, aggrigate and combine. 
  
![title](images/group_by.jpg)

In [12]:
df.groupby("sex").fnlwgt.mean()

sex
 Female    185746.311206
 Male      191771.449013
Name: fnlwgt, dtype: float64

# 3. Pre Processing

### Missing value analysis

In [13]:
# check how many missing values in each column
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
salary            0
dtype: int64

In [14]:
# generate some missing values 

# a function to return NaN with 10% probability
def rna(x):
    if np.random.random() > 0.9:
        return np.NaN
    else:
        return x

df["capital-gain"] = df["capital-gain"].map(rna)
df["education-num"] = df["education-num"].map(rna)

In [15]:
df[["capital-gain", "education-num", "sex"]].isnull().sum()

capital-gain     3209
education-num    3187
sex                 0
dtype: int64

In [16]:
# impute missing values with either mean, mode or median
val1 = df["capital-gain"].mean()
val2 = df["education-num"].mode()

df["capital-gain"].fillna(val1, inplace=True)
df["education-num"].fillna(val2, inplace=True)

df[["capital-gain", "education-num"]].isnull().sum()

capital-gain        0
education-num    3187
dtype: int64

### Dropping columns

In [17]:
cols_to_drop = ["marital-status", "workclass", "native-country", "relationship", "race"]
df.drop(cols_to_drop, axis=1, inplace=True)

### Encoding catogorical variables

In [18]:
# label encoding

fields = ["education", "occupation", "salary"]
for field in fields:
    le = LabelEncoder()
    df[field] = le.fit_transform(df[field])

# encoded variable
df["occupation"].value_counts()

10    4140
3     4099
4     4066
1     3770
12    3650
8     3295
7     2002
0     1843
14    1597
6     1370
5      994
13     928
11     649
9      149
2        9
Name: occupation, dtype: int64

In [19]:
# one hot encoding

onehot_encode_cols = ["sex"]

for field in onehot_encode_cols:
    # make dummy variables for the data column
    dum = pd.get_dummies(df[field])

    # trim column names of any unwanted characters
    #dum.columns = np.array([i.strip() for i in dum.columns])
    dum.columns = dum.columns.str.strip().str.lower().str.replace(' ', '_')
    
    # add column name as a prefix
    dum = dum.add_prefix(field + "_")

    # delete column and add one hot encoded columns
    del df[field]
    df = pd.concat([dum, df], axis=1)

df.head()

Unnamed: 0,sex_female,sex_male,age,fnlwgt,education,education-num,occupation,capital-gain,capital-loss,hours-per-week,salary
0,0,1,39,77516,9,13.0,1,2174.0,0,40,0
1,0,1,50,83311,9,13.0,4,0.0,0,13,0
2,0,1,38,215646,11,9.0,6,0.0,0,40,0
3,0,1,53,234721,1,7.0,6,0.0,0,40,0
4,1,0,28,338409,9,,10,1089.664725,0,40,0


### Scale numeric columns

In [20]:
sc = MinMaxScaler()
sc.fit_transform(df["fnlwgt"].values.reshape(-1, 1))

array([[0.0443019 ],
       [0.0482376 ],
       [0.13811345],
       ...,
       [0.09482688],
       [0.12849934],
       [0.18720338]])

# 4. Iterating over rows

In [21]:
for index, row in df.head().iterrows():
    print(index, row['age'])

0 39.0
1 50.0
2 38.0
3 53.0
4 28.0


# 5. Selection and Subsetting

In [22]:
# select dataframe rows where column = a specific value
# this is analogus to sql query 'select * from df where age = 39'
df.loc[df.age == 39].head()

Unnamed: 0,sex_female,sex_male,age,fnlwgt,education,education-num,occupation,capital-gain,capital-loss,hours-per-week,salary
0,0,1,39,77516,9,13.0,1,2174.0,0,40,0
28,0,1,39,367260,11,9.0,4,0.0,0,80,0
129,0,1,39,365739,15,10.0,3,0.0,0,40,0
166,0,1,39,235485,7,12.0,4,0.0,0,42,0
297,1,0,39,157443,12,14.0,0,3464.0,0,40,0


In [23]:
# select last column of the dataframe
df.iloc[:,-1].head()

0    0
1    0
2    0
3    0
4    0
Name: salary, dtype: int64

# 6. Change data type of column

In [24]:
# change column data type
df.fnlwgt.astype(np.int64).head()

0     77516
1     83311
2    215646
3    234721
4    338409
Name: fnlwgt, dtype: int64

# 7. Merging and Concatinating 
  
Merge - merge 2 dataframes using different kinds of joins.  
Concat - concatinate dataframes along index or columns

In [25]:
raw_data = {
        'subject_id': ['1', '2', '3', '4', '5'],
        'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'], 
        'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches']}
df_a = pd.DataFrame(raw_data, columns = ['subject_id', 'first_name', 'last_name'])
df_a

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches


In [26]:
raw_data = {
        'subject_id': ['4', '5', '6', '7', '8'],
        'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'], 
        'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan']}
df_b = pd.DataFrame(raw_data, columns = ['subject_id', 'first_name', 'last_name'])
df_b

Unnamed: 0,subject_id,first_name,last_name
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


In [27]:
raw_data = {
        'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
        'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16]}
df_n = pd.DataFrame(raw_data, columns = ['subject_id','test_id'])
df_n

Unnamed: 0,subject_id,test_id
0,1,51
1,2,15
2,3,15
3,4,61
4,5,16
5,7,14
6,8,15
7,9,1
8,10,61
9,11,16


In [28]:
# join two dataframes
# axis = 1 join along columns, axis = 0 join along index
pd.concat([df_a, df_b], axis = 0)

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


In [29]:
# when joining along index, might wanna create new index
# to ignore existing indexes and make new index
pd.concat([df_a, df_b], axis = 0, ignore_index=True)

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches
5,4,Billy,Bonder
6,5,Brian,Black
7,6,Bran,Balwner
8,7,Bryce,Brice
9,8,Betty,Btisan


In [30]:
# inner join
pd.merge(df_a, df_n, on='subject_id')

Unnamed: 0,subject_id,first_name,last_name,test_id
0,1,Alex,Anderson,51
1,2,Amy,Ackerman,15
2,3,Allen,Ali,15
3,4,Alice,Aoni,61
4,5,Ayoung,Atiches,16


In [31]:
# right join
pd.merge(df_a, df_n, on='subject_id', how="outer")

Unnamed: 0,subject_id,first_name,last_name,test_id
0,1,Alex,Anderson,51
1,2,Amy,Ackerman,15
2,3,Allen,Ali,15
3,4,Alice,Aoni,61
4,5,Ayoung,Atiches,16
5,7,,,14
6,8,,,15
7,9,,,1
8,10,,,61
9,11,,,16


In [32]:
pd.merge(df_a, df_n, on='subject_id', how="left")

Unnamed: 0,subject_id,first_name,last_name,test_id
0,1,Alex,Anderson,51
1,2,Amy,Ackerman,15
2,3,Allen,Ali,15
3,4,Alice,Aoni,61
4,5,Ayoung,Atiches,16


In [33]:
pd.merge(df_a, df_n, on='subject_id', how="right")

Unnamed: 0,subject_id,first_name,last_name,test_id
0,1,Alex,Anderson,51
1,2,Amy,Ackerman,15
2,3,Allen,Ali,15
3,4,Alice,Aoni,61
4,5,Ayoung,Atiches,16
5,7,,,14
6,8,,,15
7,9,,,1
8,10,,,61
9,11,,,16
