In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column labels for the header-less file, listed in file order.
headers = [
    'age', 'workclass', 'fnlwgt', 'education', 'educational-num',
    'ms', 'occupation', 'relation', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'country', 'salary',
]
# header=None: the first line of the file is data, so the labels come from `names`.
data = pd.read_csv("adult.data", header=None, names=headers)

In [3]:
# Preview the first five rows of the loaded frame.
data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,ms,occupation,relation,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Count the non-null entries in each column.
data.count()

age                32561
workclass          32561
fnlwgt             32561
education          32561
educational-num    32561
ms                 32561
occupation         32561
relation           32561
race               32561
sex                32561
capital-gain       32561
capital-loss       32561
hours-per-week     32561
country            32561
salary             32561
dtype: int64

Check the Data Types present in data

In [5]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age                32561 non-null int64
workclass          32561 non-null object
fnlwgt             32561 non-null int64
education          32561 non-null object
educational-num    32561 non-null int64
ms                 32561 non-null object
occupation         32561 non-null object
relation           32561 non-null object
race               32561 non-null object
sex                32561 non-null object
capital-gain       32561 non-null int64
capital-loss       32561 non-null int64
hours-per-week     32561 non-null int64
country            32561 non-null object
salary             32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


Identifying the data type category,
e.g.
Numeric - age, education
Categorical - sex, country, marital-status
Ordinal - low/medium/high

Machine learning models cannot work directly on categorical data; to use it, each categorical column has to be converted into dummy variables (numbers).

Identify the unique values in each categorical column, i.e. each column with the object dtype.

In [6]:
# List the distinct values stored in the 'sex' column.
data['sex'].unique()

array([' Male', ' Female'], dtype=object)

In [7]:
# List the distinct values stored in the 'education' column.
data['education'].unique()

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)

Make a generic check for all object-type columns.

In [8]:
# List every column label in the frame.
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num', 'ms',
       'occupation', 'relation', 'race', 'sex', 'capital-gain', 'capital-loss',
       'hours-per-week', 'country', 'salary'],
      dtype='object')

In [9]:
# Report how many distinct values each object-dtype (string) column holds.
for col in data.columns:
    column = data[col]
    # Numeric columns are not categorical candidates; skip them.
    if column.dtypes != 'object':
        continue
    unique_category = len(column.unique())
    print("Column {0} has {1} unique category".format(col, unique_category))

Column workclass has 9 unique category
Column education has 16 unique category
Column ms has 7 unique category
Column occupation has 15 unique category
Column relation has 6 unique category
Column race has 5 unique category
Column sex has 2 unique category
Column country has 42 unique category
Column salary has 2 unique category


Only country has as many as 42 unique categories, which would be unwieldy to convert into dummies, so let's approach it differently.

Check which country has the most entries, and group every other country into a single 'others' category.

In [10]:
# Print the ten most frequent 'country' values with their row counts, largest first.
print(data['country'].value_counts().sort_values(ascending=False).head(10))

 United-States    29170
 Mexico             643
 ?                  583
 Philippines        198
 Germany            137
 Canada             121
 Puerto-Rico        114
 El-Salvador        106
 India              100
 Cuba                95
Name: country, dtype: int64


In [11]:
# Collapse 'country' to two buckets. lstrip() is needed because the raw values
# carry a leading space (e.g. ' United-States').
data['country'] = [
    'others' if x.lstrip() != 'United-States' else 'United-States'
    for x in data['country']
]

In [12]:
# Print the row count of each remaining 'country' category, largest first.
print(data['country'].value_counts().sort_values(ascending=False))

United-States    29170
others            3391
Name: country, dtype: int64


Convert all categorical columns into dummy variables

In [13]:
def dummification(data,dummy_col):
    """Replace each listed categorical column with one-hot indicator columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns named in ``dummy_col``. The caller's
        frame is not modified; a new frame is built and returned.
    dummy_col : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        ``data`` without the listed columns, with the indicator columns
        (named ``<column>_<value>``) appended on the right in the order the
        columns were listed.
    """
    for x in dummy_col:
        dummies = pd.get_dummies(data[x], prefix=x, dummy_na=False)
        # Pass axis by keyword: drop() stopped accepting it positionally in
        # pandas 2.0, where `data.drop(x, 1)` raises TypeError.
        data = data.drop(x, axis=1)
        data = pd.concat([data, dummies], axis=1)
    return data

In [14]:
# Categorical predictors to one-hot encode.
dummy_col = [
    'workclass', 'education', 'ms', 'occupation',
    'relation', 'race', 'sex', 'country',
]
# Set the label column aside, then encode the remaining predictors.
target_data = data['salary']
data = dummification(data.drop(columns='salary'), dummy_col)

In [15]:
# Preview the first five rows of the encoded frame.
data.head(5)

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,relation_ Wife,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Female,sex_ Male,country_United-States,country_others
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,1,0,1,1,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,1,0,1,1,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,1,1,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
4,28,338409,13,0,0,40,0,0,0,0,...,1,0,0,1,0,0,1,0,0,1


Now check how much data is missing from all the categories

In [18]:
# Count NaN values per column and show the five columns with the most.
data.isnull().sum().sort_values(ascending=False).head()

country_others          0
education_ Bachelors    0
education_ 1st-4th      0
education_ 5th-6th      0
education_ 7th-8th      0
dtype: int64

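No NaN values show up here. Note, however, that this dataset marks unknown values with ' ?' (e.g. 583 rows in country, and the workclass_ ? dummy column above), and isnull() does not count those. For genuine NaN values, sklearn offers a simple solution: imputation.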

In [21]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22; sklearn.impute.SimpleImputer is its replacement. Fall back to the old
# class only when running on an older install.
try:
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
except ImportError:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='median', axis=0)
# Learn each column's median, then fill NaN values with it.
imp.fit(data)
# transform() returns a bare NumPy array, so re-attach the column labels.
data=pd.DataFrame(data=imp.transform(data),columns=data.columns)

In [22]:
# Re-count NaN values per column and show the five columns with the most.
data.isnull().sum().sort_values(ascending=False).head()

country_others          0
education_ Bachelors    0
education_ 1st-4th      0
education_ 5th-6th      0
education_ 7th-8th      0
dtype: int64

Nothing has changed after imputation, because, as already observed, there were no NaN values to begin with... luckily.

Data Exploration: Outlier Detection Using Tukey IQR

In [33]:
def find_outlier(X):
    """Find outliers in a numeric Series using Tukey's IQR fences.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where ``IQR = Q3 - Q1``.

    Parameters
    ----------
    X : pandas.Series
        Numeric values to inspect.

    Returns
    -------
    tuple of (list, list)
        The index labels of the outliers and the outlier values, both in
        the order they appear in ``X``.
    """
    q1 = np.percentile(X, 25)
    q3 = np.percentile(X, 75)
    iqr = q3 - q1
    floor = q1 - 1.5 * iqr
    ceiling = q3 + 1.5 * iqr
    is_outlier = (X < floor) | (X > ceiling)
    outlier_indices = list(X.index[is_outlier])
    # Select values with the mask rather than by label: a label lookup would
    # also return non-outlier rows that share a duplicated index label.
    outlier_value = list(X[is_outlier])
    return outlier_indices,outlier_value

In [34]:
# Collect the index labels and values of the 'age' entries that find_outlier flags.
tucky_indices,tucky_value = find_outlier(data['age'])

In [35]:
# Print the flagged ages in ascending order.
print(np.sort(tucky_value))

[79. 79. 79. 79. 79. 79. 79. 79. 79. 79. 79. 79. 79. 79. 79. 79. 79. 79.
 79. 79. 79. 79. 80. 80. 80. 80. 80. 80. 80. 80. 80. 80. 80. 80. 80. 80.
 80. 80. 80. 80. 80. 80. 80. 80. 81. 81. 81. 81. 81. 81. 81. 81. 81. 81.
 81. 81. 81. 81. 81. 81. 81. 81. 81. 81. 82. 82. 82. 82. 82. 82. 82. 82.
 82. 82. 82. 82. 83. 83. 83. 83. 83. 83. 84. 84. 84. 84. 84. 84. 84. 84.
 84. 84. 85. 85. 85. 86. 87. 88. 88. 88. 90. 90. 90. 90. 90. 90. 90. 90.
 90. 90. 90. 90. 90. 90. 90. 90. 90. 90. 90. 90. 90. 90. 90. 90. 90. 90.
 90. 90. 90. 90. 90. 90. 90. 90. 90. 90. 90. 90. 90. 90. 90. 90. 90.]


Interaction between the features

Use PolynomialFeatures from sklearn.preprocessing to create two-way interactions for all features

In [37]:
from itertools import combinations
from sklearn.preprocessing import PolynomialFeatures

In [63]:
def add_interections(df):
    """Append all two-way interaction (product) terms to a numeric frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric features with string column names (the names are joined
        with ``'_'`` to label each interaction column).

    Returns
    -------
    pandas.DataFrame
        The original columns followed by one ``<a>_<b>`` column per pair of
        input columns, with every all-zero column removed. The row index of
        ``df`` is preserved so the result stays aligned with the labels.
    """
    base_columns = list(df.columns)
    # Pairs are listed in itertools.combinations order, the order in which the
    # degree-2 interaction terms are expected to come out of PolynomialFeatures.
    pair_names = ['_'.join(pair) for pair in combinations(base_columns, 2)]
    column_names = base_columns + pair_names

    # interaction_only=True skips squared terms; include_bias=False skips the
    # constant column, so the output width matches len(column_names).
    poly = PolynomialFeatures(interaction_only=True,include_bias=False)
    interactions = pd.DataFrame(
        poly.fit_transform(df),
        columns=column_names,
        index=df.index,  # keep row labels instead of resetting to 0..n-1
    )

    # Remove terms that are zero in every row, e.g. the product of two dummies
    # from the same original column. Selecting by position avoids dropping a
    # same-named column by accident.
    all_zero = (interactions == 0).all().to_numpy()
    return interactions.loc[:, ~all_zero]

In [64]:
# Build the feature matrix with the pairwise interaction terms added,
# then print its first five rows.
X = add_interections(data)
print(X.head(5))

    age    fnlwgt  educational-num  capital-gain  capital-loss  \
0  39.0   77516.0             13.0        2174.0           0.0   
1  50.0   83311.0             13.0           0.0           0.0   
2  38.0  215646.0              9.0           0.0           0.0   
3  53.0  234721.0              7.0           0.0           0.0   
4  28.0  338409.0             13.0           0.0           0.0   

   hours-per-week  workclass_ ?  workclass_ Federal-gov  workclass_ Local-gov  \
0            40.0           0.0                     0.0                   0.0   
1            13.0           0.0                     0.0                   0.0   
2            40.0           0.0                     0.0                   0.0   
3            40.0           0.0                     0.0                   0.0   
4            40.0           0.0                     0.0                   0.0   

   workclass_ Never-worked            ...             \
0                      0.0            ...              
1   

Dimensionality Reduction using PCA (Principal Component Analysis)

TypeError: count() takes exactly one argument (0 given)