In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from random import shuffle

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

In [3]:
df = pd.read_csv("./clean_data/fully_merged_data.csv")
df.head()

Unnamed: 0,sex,age,day,month,MEDHINC_CY,WLTHINDXCY,time_arr,TOTHH_CY,fel_misd
0,M,21,4,0,25336.0,27.0,2310,1309.0,M
1,M,21,4,0,25336.0,27.0,2310,1309.0,M
2,M,24,5,0,25427.0,30.0,16,1359.0,M
3,M,24,5,0,25427.0,30.0,16,1359.0,M
4,M,21,4,0,25336.0,27.0,2310,1309.0,M


In [4]:
need_norm = ["age","MEDHINC_CY", "WLTHINDXCY", "TOTHH_CY", "time_arr"]
norm = (df[need_norm] - df[need_norm].mean())/df[need_norm].std()
norm.mean(), norm.std(), norm.head()

(age           1.486255e-16
 MEDHINC_CY    1.008381e-16
 WLTHINDXCY   -1.062775e-16
 TOTHH_CY     -5.334798e-17
 time_arr     -4.237497e-17
 dtype: float64,
 age           1.0
 MEDHINC_CY    1.0
 WLTHINDXCY    1.0
 TOTHH_CY      1.0
 time_arr      1.0
 dtype: float64,
         age  MEDHINC_CY  WLTHINDXCY  TOTHH_CY  time_arr
 0 -1.008905   -0.796520   -0.501925 -0.448262  1.432337
 1 -1.008905   -0.796520   -0.501925 -0.448262  1.432337
 2 -0.760608   -0.789672   -0.372174 -0.411262 -1.776615
 3 -0.760608   -0.789672   -0.372174 -0.411262 -1.776615
 4 -1.008905   -0.796520   -0.501925 -0.448262  1.432337)

In [5]:
df[need_norm] = norm
df.head()

Unnamed: 0,sex,age,day,month,MEDHINC_CY,WLTHINDXCY,time_arr,TOTHH_CY,fel_misd
0,M,-1.008905,4,0,-0.79652,-0.501925,1.432337,-0.448262,M
1,M,-1.008905,4,0,-0.79652,-0.501925,1.432337,-0.448262,M
2,M,-0.760608,5,0,-0.789672,-0.372174,-1.776615,-0.411262,M
3,M,-0.760608,5,0,-0.789672,-0.372174,-1.776615,-0.411262,M
4,M,-1.008905,4,0,-0.79652,-0.501925,1.432337,-0.448262,M


In [6]:
df = df.dropna()
df.head()

Unnamed: 0,sex,age,day,month,MEDHINC_CY,WLTHINDXCY,time_arr,TOTHH_CY,fel_misd
0,M,-1.008905,4,0,-0.79652,-0.501925,1.432337,-0.448262,M
1,M,-1.008905,4,0,-0.79652,-0.501925,1.432337,-0.448262,M
2,M,-0.760608,5,0,-0.789672,-0.372174,-1.776615,-0.411262,M
3,M,-0.760608,5,0,-0.789672,-0.372174,-1.776615,-0.411262,M
4,M,-1.008905,4,0,-0.79652,-0.501925,1.432337,-0.448262,M


In [7]:
# removing the imbalanced classes
# temp1 = df[df['fel_misd'] == 2.0]
# temp2 = df[df['fel_misd'] == 1.0]
# df = pd.concat([temp1, temp2])
# df = df.drop(temp)
# temp = df.index[df['fel_misd'] == 1.0].tolist()
# df = df.drop(temp)

In [8]:
# remove null values
arr = df.index[df["fel_misd"] == ' ']
df = df.drop(arr, axis=0)
arr = df.index[df["fel_misd"] == '\xa0']
df = df.drop(arr, axis=0)
df

Unnamed: 0,sex,age,day,month,MEDHINC_CY,WLTHINDXCY,time_arr,TOTHH_CY,fel_misd
0,M,-1.008905,4,0,-0.796520,-0.501925,1.432337,-0.448262,M
1,M,-1.008905,4,0,-0.796520,-0.501925,1.432337,-0.448262,M
2,M,-0.760608,5,0,-0.789672,-0.372174,-1.776615,-0.411262,M
3,M,-0.760608,5,0,-0.789672,-0.372174,-1.776615,-0.411262,M
4,M,-1.008905,4,0,-0.796520,-0.501925,1.432337,-0.448262,M
...,...,...,...,...,...,...,...,...,...
68072,M,-0.760608,6,11,0.017538,-0.199173,-1.298210,0.370182,M
68073,F,1.308536,2,1,-1.182063,-0.804676,-0.667331,0.880784,M
68074,M,-0.926140,4,2,-0.395248,-0.155923,0.337040,1.413587,M
68075,M,-0.926140,4,2,-0.395248,-0.155923,0.337040,1.413587,M


In [9]:
df["fel_misd"].value_counts()

M    46803
F    16407
C     2194
S      240
P       50
Name: fel_misd, dtype: int64

In [69]:
df_pandas_encoded = pd.get_dummies(df, columns=['sex', 'day', 'month'], drop_first=True)
# df_pandas_encoded = pd.get_dummies(df_pandas_encoded, columns=["fel_misd"], drop_first=False)
df_pandas_encoded.head()

Unnamed: 0,age,MEDHINC_CY,WLTHINDXCY,time_arr,TOTHH_CY,fel_misd,sex_M,sex_U,day_1,day_2,...,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11
0,-1.008905,-0.79652,-0.501925,1.432337,-0.448262,M,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-1.008905,-0.79652,-0.501925,1.432337,-0.448262,M,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.760608,-0.789672,-0.372174,-1.776615,-0.411262,M,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-0.760608,-0.789672,-0.372174,-1.776615,-0.411262,M,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-1.008905,-0.79652,-0.501925,1.432337,-0.448262,M,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
m_arr = df_pandas_encoded.index[df_pandas_encoded["fel_misd"] == 'M'].tolist()
f_arr = df_pandas_encoded.index[df_pandas_encoded["fel_misd"] == 'F'].tolist()

shuffle(m_arr)
shuffle(f_arr)

In [71]:
df_pandas_encoded = df_pandas_encoded.drop(m_arr[0:44000], axis = 0)
df_pandas_encoded = df_pandas_encoded.drop(f_arr[0:14000], axis = 0)
df_pandas_encoded['fel_misd'].value_counts()

M    2803
F    2407
C    2194
S     240
P      50
Name: fel_misd, dtype: int64

In [72]:
df_pandas_encoded = df_pandas_encoded.drop("WLTHINDXCY", axis=1)

In [73]:
inp = list(df_pandas_encoded.columns)
# oup = ['fel_misd_0.0',
#      'fel_misd_1.0',
#      'fel_misd_2.0',
#      'fel_misd_3.0',
#      'fel_misd_4.0',
#      'fel_misd_5.0']
oup = ["fel_misd"]
for x in oup:
    inp.remove(x)
inp, oup

(['age',
  'MEDHINC_CY',
  'time_arr',
  'TOTHH_CY',
  'sex_M',
  'sex_U',
  'day_1',
  'day_2',
  'day_3',
  'day_4',
  'day_5',
  'day_6',
  'month_1',
  'month_2',
  'month_3',
  'month_4',
  'month_5',
  'month_6',
  'month_7',
  'month_8',
  'month_9',
  'month_10',
  'month_11'],
 ['fel_misd'])

In [74]:
x,y = df_pandas_encoded[inp], df_pandas_encoded[oup]
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size=0.8, test_size=0.2)

In [75]:
import itertools
items = ['MEDHINC_CY','age','sex_M', 'day', 'month']
combs = []
for i in range(1, len(items)):
    combs.append(list(set(itertools.combinations(items, i))))

In [76]:
day_vals = ['day_1',
          'day_2',
          'day_3',
          'day_4',
          'day_5',
          'day_6']

month_vals = ['month_1',
              'month_2',
              'month_3',
              'month_4',
              'month_5',
              'month_6',
              'month_7',
              'month_8',
              'month_9',
              'month_10',
              'month_11']
fel_misd_vals = [ 'fel_misd_1',
                  'fel_misd_2',
                  'fel_misd_3',
                  'fel_misd_4',
                  'fel_misd_5']

best_dict = {"features": [], "score": -2**31}
for k_amt in combs:
    for ind_comb in k_amt:
        comb = list(ind_comb)
        if 'day' in comb:
            comb.remove('day')
            comb += day_vals
        if 'month' in comb:
            comb.remove('month')
            comb += month_vals
        if 'fel_misd' in comb:
            comb.remove('fel_misd')
            comb += fel_misd_vals
        
        x_subset = x_train[comb].values
        cvs = cross_val_score(linear_model.LogisticRegression(multi_class='ovr'), x_subset, y_train)
        
        if cvs.mean() > best_dict["score"]:
            best_dict["features"] = comb
            best_dict["score"] = cvs.mean()
best_dict

{'features': ['age',
  'sex_M',
  'day_1',
  'day_2',
  'day_3',
  'day_4',
  'day_5',
  'day_6',
  'month_1',
  'month_2',
  'month_3',
  'month_4',
  'month_5',
  'month_6',
  'month_7',
  'month_8',
  'month_9',
  'month_10',
  'month_11'],
 'score': 0.4038992688870837}

In [77]:
lr = linear_model.LogisticRegression(multi_class="ovr")

In [78]:
best_dict["features"]

['age',
 'sex_M',
 'day_1',
 'day_2',
 'day_3',
 'day_4',
 'day_5',
 'day_6',
 'month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'month_7',
 'month_8',
 'month_9',
 'month_10',
 'month_11']

In [79]:
x_train = x_train[best_dict["features"]]
x_test = x_test[best_dict["features"]]

lr.fit(x_train, y_train)

In [80]:
y_pred = lr.predict(x_test)
print(f"Accuracy train: {lr.score(x_train,y_train)}")
print(f"Accuracy Test: {lr.score(x_test,y_test)}")

Accuracy train: 0.4066612510154346
Accuracy Test: 0.40025990903183883


In [81]:
confusion_matrix(y_test, y_pred)

array([[104, 118, 202,   0,   0],
       [ 65, 193, 246,   0,   1],
       [ 77, 163, 312,   0,   3],
       [  4,   4,   2,   0,   0],
       [ 10,  15,  13,   0,   7]])

In [82]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           C       0.40      0.25      0.30       424
           F       0.39      0.38      0.39       505
           M       0.40      0.56      0.47       555
           P       0.00      0.00      0.00        10
           S       0.64      0.16      0.25        45

    accuracy                           0.40      1539
   macro avg       0.37      0.27      0.28      1539
weighted avg       0.40      0.40      0.39      1539

