In [39]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder

In [5]:
random.seed(321)
n = 1000
age = random.choices(range(25,45), k=n)
gender = random.choices(["M","F"], k=n)
education = random.choices(["BS","MS","PhD"], k=n, weights=[0.6,0.3,0.1])
location = random.choices(["US","EU"], k=n)

owner = [True if education[i] in ["MS","PhD"] and age[i]>35 else False for i in range(n)]
income = [random.choice(range(120000,150000,1000)) if loc=="US" else random.choice(range(50000,90000,1000)) for loc in location]
price = [random.choice(range(500000,600000,10000)) if 12000<=inc<=15000 else random.choice(range(300000,400000,10000)) for inc in income]
home_value = [price[i] if owner[i]==True else np.nan for i in range(n)]

df = pd.DataFrame({"age":age,"gender":gender,"education":education,"location":location,"income":income,"owner":owner,"home_value":home_value})
df

Unnamed: 0,age,gender,education,location,income,owner,home_value
0,30,F,BS,EU,55000,False,
1,27,M,BS,EU,71000,False,
2,36,M,MS,EU,75000,True,380000.0
3,33,F,BS,US,142000,False,
4,44,F,MS,EU,80000,True,360000.0
...,...,...,...,...,...,...,...
995,43,F,PhD,US,143000,True,350000.0
996,39,F,BS,EU,74000,False,
997,41,M,BS,US,144000,False,
998,40,M,MS,US,148000,True,360000.0


In [34]:
df_gender = pd.get_dummies(df.gender, prefix='gender')
df_education = pd.get_dummies(df.education, prefix='education')
df_location = pd.get_dummies(df.location, prefix='location')
df_owner = pd.get_dummies(df.owner, prefix='owner')
df = pd.concat([df, df_gender, df_education, df_location, df_owner], axis=1) # df.reset_index(drop=True) if not matching index
df

Unnamed: 0,age,gender,education,location,income,owner,home_value,gender_F,gender_M,education_BS,...,location_US,gender_F.1,gender_M.1,education_BS.1,education_MS,education_PhD,location_EU,location_US.1,owner_0,owner_1
0,30,F,BS,EU,55000,0,,1,0,1,...,0,1,0,1,0,0,1,0,1,0
1,27,M,BS,EU,71000,0,,0,1,1,...,0,0,1,1,0,0,1,0,1,0
2,36,M,MS,EU,75000,1,380000.0,0,1,0,...,0,0,1,0,1,0,1,0,0,1
3,33,F,BS,US,142000,0,,1,0,1,...,1,1,0,1,0,0,0,1,1,0
4,44,F,MS,EU,80000,1,360000.0,1,0,0,...,0,1,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,43,F,PhD,US,143000,1,350000.0,1,0,0,...,1,1,0,0,0,1,0,1,0,1
996,39,F,BS,EU,74000,0,,1,0,1,...,0,1,0,1,0,0,1,0,1,0
997,41,M,BS,US,144000,0,,0,1,1,...,1,0,1,1,0,0,0,1,1,0
998,40,M,MS,US,148000,1,360000.0,0,1,0,...,1,0,1,0,1,0,0,1,0,1


In [51]:
cols = df_gender.columns.append(df_education.columns).append(df_location.columns).append(df_owner.columns).tolist()
cols.extend(['age'])

df['age'] = df.age.map(lambda x: (x - df.age.min())/(df.age.max() - df.age.min()))
df

X = df[cols]
y = df[['income']]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
clf = LogisticRegression()
clf.fit(X_train, Y_train)

  return f(**kwargs)


LogisticRegression()

In [52]:
train_error = (clf.predict(X_train).reshape([len(X_train), 1]) - Y_train).sum()
test_error = (clf.predict(X_test).reshape([len(X_test), 1]) - Y_test).sum()
train_error, test_error

(income   -38000
 dtype: int64,
 income    74000
 dtype: int64)

In [53]:
clf.score(X_train, Y_train), clf.score(X_test, Y_test)

(0.09571428571428571, 0.03333333333333333)

In [59]:
clf.coef_

(70, 17)

makes sense. age and education should be predictors.

In [55]:
confusion_matrix(Y_train, clf.predict(X_train))

array([[ 2,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 1,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  4,  0],
       [ 0,  0,  0, ...,  0, 11,  0],
       [ 0,  0,  0, ...,  0,  2,  0]])

In [28]:
confusion_matrix(Y_test, clf.predict(X_test))

array([[248,   0],
       [  0,  52]])

In [30]:
print(classification_report(Y_train, clf.predict(X_train)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       567
           1       1.00      1.00      1.00       133

    accuracy                           1.00       700
   macro avg       1.00      1.00      1.00       700
weighted avg       1.00      1.00      1.00       700



In [31]:
print(classification_report(Y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       248
           1       1.00      1.00      1.00        52

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300

