In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

In [3]:
df = pd.read_csv("./clean_data/updated_fully_merged_data.csv", dtype=np.float64)
df.head()

Unnamed: 0,sex,age,day,month,MEDHINC_CY,WLTHINDXCY,fel_misd
0,1.0,21.0,4.0,0.0,25336.0,27.0,2.0
1,1.0,21.0,4.0,0.0,25336.0,27.0,2.0
2,1.0,24.0,5.0,0.0,25427.0,30.0,2.0
3,1.0,24.0,5.0,0.0,25427.0,30.0,2.0
4,1.0,21.0,4.0,0.0,25336.0,27.0,2.0


In [4]:
need_norm = ["age","MEDHINC_CY", "WLTHINDXCY"]
norm = (df[need_norm] - df[need_norm].mean())/df[need_norm].std()
norm.mean(), norm.std(), norm.head()

(age           1.486255e-16
 MEDHINC_CY    1.008381e-16
 WLTHINDXCY   -1.062775e-16
 dtype: float64,
 age           1.0
 MEDHINC_CY    1.0
 WLTHINDXCY    1.0
 dtype: float64,
         age  MEDHINC_CY  WLTHINDXCY
 0 -1.008905   -0.796520   -0.501925
 1 -1.008905   -0.796520   -0.501925
 2 -0.760608   -0.789672   -0.372174
 3 -0.760608   -0.789672   -0.372174
 4 -1.008905   -0.796520   -0.501925)

In [5]:
df[need_norm] = norm
df.head()

Unnamed: 0,sex,age,day,month,MEDHINC_CY,WLTHINDXCY,fel_misd
0,1.0,-1.008905,4.0,0.0,-0.79652,-0.501925,2.0
1,1.0,-1.008905,4.0,0.0,-0.79652,-0.501925,2.0
2,1.0,-0.760608,5.0,0.0,-0.789672,-0.372174,2.0
3,1.0,-0.760608,5.0,0.0,-0.789672,-0.372174,2.0
4,1.0,-1.008905,4.0,0.0,-0.79652,-0.501925,2.0


In [6]:
df = df.dropna()
df.head()

Unnamed: 0,sex,age,day,month,MEDHINC_CY,WLTHINDXCY,fel_misd
0,1.0,-1.008905,4.0,0.0,-0.79652,-0.501925,2.0
1,1.0,-1.008905,4.0,0.0,-0.79652,-0.501925,2.0
2,1.0,-0.760608,5.0,0.0,-0.789672,-0.372174,2.0
3,1.0,-0.760608,5.0,0.0,-0.789672,-0.372174,2.0
4,1.0,-1.008905,4.0,0.0,-0.79652,-0.501925,2.0


In [7]:
# removing the imbalanced classes
# temp1 = df[df['fel_misd'] == 2.0]
# temp2 = df[df['fel_misd'] == 1.0]
# df = pd.concat([temp1, temp2])
# df = df.drop(temp)
# temp = df.index[df['fel_misd'] == 1.0].tolist()
# df = df.drop(temp)

In [8]:
df["fel_misd"].value_counts()

2.0    46803
1.0    16407
5.0     2233
4.0     2194
3.0      240
0.0       50
Name: fel_misd, dtype: int64

In [9]:
df_pandas_encoded = pd.get_dummies(df, columns=['sex', 'day', 'month', 'fel_misd'], drop_first=True)
# df_pandas_encoded = pd.get_dummies(df_pandas_encoded, columns=["fel_misd"], drop_first=False)
df_pandas_encoded.head()

Unnamed: 0,age,MEDHINC_CY,WLTHINDXCY,sex_1.0,day_1.0,day_2.0,day_3.0,day_4.0,day_5.0,day_6.0,...,month_7.0,month_8.0,month_9.0,month_10.0,month_11.0,fel_misd_1.0,fel_misd_2.0,fel_misd_3.0,fel_misd_4.0,fel_misd_5.0
0,-1.008905,-0.79652,-0.501925,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,-1.008905,-0.79652,-0.501925,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,-0.760608,-0.789672,-0.372174,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,-0.760608,-0.789672,-0.372174,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,-1.008905,-0.79652,-0.501925,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [10]:
df_pandas_encoded = df_pandas_encoded.drop("WLTHINDXCY", axis=1)

In [11]:
inp = list(df_pandas_encoded.columns)
# oup = ['fel_misd_0.0',
#      'fel_misd_1.0',
#      'fel_misd_2.0',
#      'fel_misd_3.0',
#      'fel_misd_4.0',
#      'fel_misd_5.0']
oup = ["MEDHINC_CY"]
for x in oup:
    inp.remove(x)
inp, oup

(['age',
  'sex_1.0',
  'day_1.0',
  'day_2.0',
  'day_3.0',
  'day_4.0',
  'day_5.0',
  'day_6.0',
  'month_1.0',
  'month_2.0',
  'month_3.0',
  'month_4.0',
  'month_5.0',
  'month_6.0',
  'month_7.0',
  'month_8.0',
  'month_9.0',
  'month_10.0',
  'month_11.0',
  'fel_misd_1.0',
  'fel_misd_2.0',
  'fel_misd_3.0',
  'fel_misd_4.0',
  'fel_misd_5.0'],
 ['MEDHINC_CY'])

In [20]:
x,y = df_pandas_encoded[inp], df_pandas_encoded[oup]
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size=0.8, test_size=0.2)

In [21]:
import itertools
items = ['fel_misd','age','sex_1.0', 'day', 'month']
combs = []
for i in range(1, len(items)):
    combs.append(list(set(itertools.combinations(items, i))))

In [22]:
day_vals = ['day_1.0',
          'day_2.0',
          'day_3.0',
          'day_4.0',
          'day_5.0',
          'day_6.0']

month_vals = ['month_1.0',
              'month_2.0',
              'month_3.0',
              'month_4.0',
              'month_5.0',
              'month_6.0',
              'month_7.0',
              'month_8.0',
              'month_9.0',
              'month_10.0',
              'month_11.0']
fel_misd_vals = [ 'fel_misd_1.0',
                  'fel_misd_2.0',
                  'fel_misd_3.0',
                  'fel_misd_4.0',
                  'fel_misd_5.0']

best_dict = {"features": [], "score": -2**31}
for k_amt in combs:
    for ind_comb in k_amt:
        comb = list(ind_comb)
        if 'day' in comb:
            comb.remove('day')
            comb += day_vals
        if 'month' in comb:
            comb.remove('month')
            comb += month_vals
        if 'fel_misd' in comb:
            comb.remove('fel_misd')
            comb += fel_misd_vals
        
        x_subset = x_train[comb].values
        cvs = cross_val_score(linear_model.LinearRegression(), x_subset, y_train)
        
        if cvs.mean() > best_dict["score"]:
            best_dict["features"] = comb
            best_dict["score"] = cvs.mean()
best_dict

{'features': ['age',
  'sex_1.0',
  'day_1.0',
  'day_2.0',
  'day_3.0',
  'day_4.0',
  'day_5.0',
  'day_6.0',
  'fel_misd_1.0',
  'fel_misd_2.0',
  'fel_misd_3.0',
  'fel_misd_4.0',
  'fel_misd_5.0'],
 'score': 0.003450854704518158}

In [23]:
lr = linear_model.LinearRegression()

In [24]:
best_dict["features"]

['age',
 'sex_1.0',
 'day_1.0',
 'day_2.0',
 'day_3.0',
 'day_4.0',
 'day_5.0',
 'day_6.0',
 'fel_misd_1.0',
 'fel_misd_2.0',
 'fel_misd_3.0',
 'fel_misd_4.0',
 'fel_misd_5.0']

In [25]:
x_train = x_train[['sex_1.0']]
x_test = x_test[['sex_1.0']]

lr.fit(x_train, y_train)

In [26]:
y_pred = lr.predict(x_test)
print(f"R2 train: {lr.score(x_train,y_train)}")
print(f"R2 Test: {lr.score(x_test,y_test)}")

R2 train: 0.0005411209195155253
R2 Test: 0.0007503484372012315
