In [7]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

### Adult Census

In [14]:
# Load the raw Adult Census table; the path is relative to the current working directory.
df = pd.read_csv('datasets/raw/adult.csv')

In [15]:
# Preview the first 10 rows of the raw data.
df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K
7,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20,United-States,>50K
8,68,Federal-gov,422013,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,3683,40,United-States,<=50K
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,?,>50K


In [16]:
# Count NaN values per column.
# NOTE(review): the preview of the raw rows shows '?' placeholders (e.g. in workclass,
# occupation and native.country). Those are ordinary strings, so isna() does not count
# them; an all-zero result here does not prove there are no missing values.
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [17]:
# Disabled: mode imputation for the columns that show '?' placeholders in the raw preview.
# NOTE(review): as written, fillna() only replaces NaN; the '?' strings would first have
# to be converted to NaN for this loop to change anything — confirm intent before enabling.
# for col in ['workclass', 'occupation', 'native.country']:
#     df[col].fillna(df[col].mode()[0], inplace=True)

In [18]:
# convert target to binary
le = preprocessing.LabelEncoder()
df['income'] = le.fit_transform(df['income'])

### For clean dataset creation

In [19]:
# Columns holding category labels; each one is replaced by integer codes below.
categorical = [
    'workclass',
    'education',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country',
]
for feature in categorical:
    # A fresh encoder per column: the codes of one column are unrelated to the others.
    le = preprocessing.LabelEncoder()
    le.fit(df[feature])
    df[feature] = le.transform(df[feature])

In [20]:
# Spot-check the first 5 rows after the label encoding step.
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,0,77053,11,9,6,0,1,4,0,0,4356,40,39,0
1,82,4,132870,11,9,6,4,1,4,0,0,4356,18,39,0
2,66,0,186061,15,10,6,0,4,2,0,0,4356,40,39,0
3,54,4,140359,5,4,0,7,4,4,0,0,3900,40,39,0
4,41,4,264663,15,10,5,10,3,4,0,0,3900,40,39,0


In [21]:
# Standardise every column except the target `income`.
# Alternatives that were tried and left disabled: scaling only the continuous columns
# ['age', 'fnlwgt', 'capital.loss', 'hours.per.week'], or scaling the whole frame
# (target included).
cols = pd.Series([name for name in df.columns if name != 'income'])  # all but target
scaler = StandardScaler()
df[cols] = pd.DataFrame(scaler.fit_transform(df[cols]), columns=cols)

In [22]:
# Spot-check the first 5 rows after standardisation.
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,3.769612,-2.65732,-1.067997,0.181332,-0.42006,2.24948,-1.554283,-0.277805,0.393668,-1.422331,-0.14592,10.593507,-0.035429,0.291569,0
1,3.183112,0.09005,-0.539169,0.181332,-0.42006,2.24948,-0.608387,-0.277805,0.393668,-1.422331,-0.14592,10.593507,-1.817204,0.291569,0
2,2.01011,-2.65732,-0.03522,1.214869,-0.03136,2.24948,-1.554283,1.589322,-1.962621,-1.422331,-0.14592,10.593507,-0.035429,0.291569,0
3,1.130359,0.09005,-0.468215,-1.368974,-2.363558,-1.734058,0.101036,1.589322,0.393668,-1.422331,-0.14592,9.461864,-0.035429,0.291569,0
4,0.177296,0.09005,0.709482,1.214869,-0.03136,1.585557,0.810458,0.966947,0.393668,-1.422331,-0.14592,9.461864,-0.035429,0.291569,0


In [23]:
# Re-count NaN values per column after encoding and scaling.
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [24]:
# Persist the encoded and scaled frame. index=True (the pandas default) also writes the
# row index as the first, unnamed CSV column.
df.to_csv('datasets/clean/adult_scaled.csv', index=True)

### For model training

In [7]:
# Separate the target vector from the feature matrix.
y = df['income']
X = df.drop(columns=['income'])

In [8]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [9]:
# Integer-encode the categorical columns. The label -> code mapping is learned from the
# training split only and then applied to the test split.
categorical = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

# Work on explicit copies: the frames returned by train_test_split are row selections of X,
# and assigning columns into them can trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

for feature in categorical:
    le = preprocessing.LabelEncoder()
    X_train[feature] = le.fit_transform(X_train[feature])
    # LabelEncoder.transform() raises ValueError for any label that did not occur in the
    # training split (a rare category can end up only in the test split). Look the labels
    # up in the fitted classes instead and give unseen ones the sentinel code -1; labels
    # that were seen get exactly the code le.transform() would have produced.
    codes = {label: code for code, label in enumerate(le.classes_)}
    X_test[feature] = X_test[feature].map(codes).fillna(-1).astype('int64')

In [10]:
# Learn the standardisation statistics from the training split only, then apply the
# same transformation to both splits.
# The rebuilt frames are created without an explicit index, so they get a fresh
# 0..n-1 index rather than the row labels carried by y_train / y_test.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

In [11]:
# Fit a logistic regression with default settings on all features and report its
# accuracy on the held-out split.
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('Logistic Regression accuracy score with all the features: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

Logistic Regression accuracy score with all the features: 0.8204


### Home Price Prediction

In [41]:
# Load the raw house-price table; the path is relative to the current working directory.
data = pd.read_csv('datasets/raw/house_price.csv')

In [42]:
# Preview the first 10 rows of the raw data.
data.head(10)

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA
5,2014-05-02 00:00:00,490000.0,2.0,1.0,880,6380,1.0,0,0,3,880,0,1938,1994,522 NE 88th St,Seattle,WA 98115,USA
6,2014-05-02 00:00:00,335000.0,2.0,2.0,1350,2560,1.0,0,0,3,1350,0,1976,0,2616 174th Ave NE,Redmond,WA 98052,USA
7,2014-05-02 00:00:00,482000.0,4.0,2.5,2710,35868,2.0,0,0,3,2710,0,1989,0,23762 SE 253rd Pl,Maple Valley,WA 98038,USA
8,2014-05-02 00:00:00,452500.0,3.0,2.5,2430,88426,1.0,0,0,4,1570,860,1985,0,46611-46625 SE 129th St,North Bend,WA 98045,USA
9,2014-05-02 00:00:00,640000.0,4.0,2.0,1520,6200,1.5,0,0,3,1520,0,1945,2010,6811 55th Ave NE,Seattle,WA 98115,USA


In [43]:
# Cast the numeric and text columns to their working dtypes in one pass.
# NOTE(review): casting to int64 drops the fractional part, so raw values such as
# bathrooms=1.5 / 2.25 or floors=1.5 become 1 / 2 / 1 — confirm this loss is intended.
data = data.astype({
    'price': 'int64',
    'bedrooms': 'int64',
    'bathrooms': 'int64',
    'floors': 'int64',
    'street': 'string',
    'city': 'string',
    'statezip': 'string',
    'country': 'string',
})

In [44]:
# Discard the rows whose recorded price is 0.
data = data.loc[data['price'].ne(0)]

In [45]:
# Remove the columns that are not carried forward into the cleaned dataset.
data = data.drop(columns=['date', 'street', 'statezip', 'country', 'sqft_above'])

In [47]:
# Encode the city names as integer codes (this is a feature column, not the target).
le = preprocessing.LabelEncoder()
data = data.assign(city=le.fit_transform(data['city']))

In [48]:
# Spot-check the first 10 rows after the casts, row filter, column drop and city encoding.
data.head(10)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_basement,yr_built,yr_renovated,city
0,313000,3,1,1340,7912,1,0,0,3,0,1955,2005,36
1,2384000,5,2,3650,9050,2,0,4,5,280,1921,0,35
2,342000,3,2,1930,11947,1,0,0,4,0,1966,0,18
3,420000,3,2,2000,8030,1,0,0,4,1000,1963,0,3
4,550000,4,2,1940,10500,1,0,0,4,800,1976,1992,31
5,490000,2,1,880,6380,1,0,0,3,0,1938,1994,35
6,335000,2,2,1350,2560,1,0,0,3,0,1976,0,31
7,482000,4,2,2710,35868,2,0,0,3,0,1989,0,21
8,452500,3,2,2430,88426,1,0,0,4,860,1985,0,27
9,640000,4,2,1520,6200,1,0,0,3,0,1945,2010,35


In [49]:
# dataset normalization
columns = data.columns
scaler = preprocessing.MinMaxScaler(feature_range = (0, 1))
data = pd.DataFrame(scaler.fit_transform(data), columns = columns)

### For clean dataset creation

In [57]:
# Spot-check the first 5 rows after normalisation.
data.head(5)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_basement,yr_built,yr_renovated,city
0,0.011481,0.333333,0.125,0.073652,0.006775,0.0,0.0,0.0,0.5,0.0,0.482456,0.995531,0.837209
1,0.089391,0.555556,0.25,0.249051,0.007835,0.5,0.0,1.0,1.0,0.058091,0.184211,0.0,0.813953
2,0.012572,0.333333,0.25,0.118451,0.010534,0.0,0.0,0.0,0.75,0.0,0.578947,0.0,0.418605
3,0.015507,0.333333,0.25,0.123766,0.006885,0.0,0.0,0.0,0.75,0.207469,0.552632,0.0,0.069767
4,0.020397,0.444444,0.25,0.11921,0.009186,0.0,0.0,0.0,0.75,0.165975,0.666667,0.989076,0.72093


In [58]:
# Persist the normalised frame. index=True (the pandas default) also writes the row
# index as the first, unnamed CSV column.
data.to_csv('datasets/clean/house_price.csv', index=True)

### For model training

In [52]:
# Separate the predictors from the target; y is kept as a one-column DataFrame.
X = data.drop(columns="price")
y = data[["price"]]

In [59]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [60]:
# Fit an ordinary least-squares model and score it on the held-out split.
# `data` was min-max scaled (price included) before the split, so the RMSE printed
# here is expressed in scaled-price units.
linreg = LinearRegression().fit(X_train, y_train)
y_pred = linreg.predict(X_test)

print("Root Mean Squared Error: ", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 Score: ", r2_score(y_test, y_pred))

Root Mean Squared Error:  0.0087142072843099
R2 Score:  0.5844905131307716
