In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, explained_variance_score
import numpy as np
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from lightgbm import LGBMClassifier
import datetime as dt

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
#inputting train data
df_train = pd.read_csv('train.csv.xlt', low_memory=False)

In [3]:
#checking for dimensions of the train set
df_train.shape

(1143, 75)

In [4]:
#checking for number of nan values per column
# df_train.isna().sum()

In [5]:
# df_train['cons_alcohol'].dropna

In [6]:
# df_train.isna().sum()

In [7]:
df_train.drop(['med_u5_deaths', 'med_expenses_sp_ep'], axis=1)

Unnamed: 0,surveyid,village,survey_date,femaleres,age,married,children,hhsize,edu,hh_children,...,given_mpesa,amount_given_mpesa,received_mpesa,amount_received_mpesa,net_mpesa,saved_mpesa,amount_saved_mpesa,early_survey,depressed,day_of_week
0,926,91,23-Nov-61,1,28.0,1,4,6,10,0,...,0,0.0,0,0.000000,0.000000,1,0.000000,0,0,5
1,747,57,24-Oct-61,1,23.0,1,3,5,8,0,...,0,0.0,1,4.804611,4.804611,0,0.000000,0,1,3
2,1190,115,05-Oct-61,1,22.0,1,3,5,9,0,...,0,0.0,0,8.007685,8.007685,1,0.000000,0,0,5
3,1065,97,23-Sep-61,1,27.0,1,2,4,10,2,...,0,0.0,0,0.000000,0.000000,1,1.249199,0,0,0
4,806,42,12-Sep-61,0,59.0,0,4,6,10,4,...,0,0.0,0,0.000000,0.000000,0,0.000000,0,0,3
5,483,25,08-Sep-61,1,35.0,1,6,8,10,6,...,0,0.0,0,0.000000,0.000000,0,0.000000,0,0,6
6,849,130,31-Oct-61,0,34.0,0,1,3,9,1,...,0,0.0,0,0.000000,0.000000,0,0.000000,0,1,3
7,1386,72,03-Sep-61,1,21.0,1,2,4,10,2,...,0,0.0,0,0.000000,0.000000,0,0.000000,0,0,1
8,930,195,12-Nov-61,1,32.0,1,7,9,9,7,...,0,0.0,0,0.000000,0.000000,0,0.000000,0,0,1
9,390,33,28-Nov-61,1,29.0,1,4,6,10,0,...,0,0.0,0,0.000000,0.000000,0,0.000000,0,0,3


In [8]:
df_train = df_train.dropna()

In [9]:
df_train.shape

(12, 75)

In [10]:
df_train.head()

Unnamed: 0,surveyid,village,survey_date,femaleres,age,married,children,hhsize,edu,hh_children,...,given_mpesa,amount_given_mpesa,received_mpesa,amount_received_mpesa,net_mpesa,saved_mpesa,amount_saved_mpesa,early_survey,depressed,day_of_week
22,137,9,21-Jul-60,1,34.0,1,3,5,10,3,...,0,0.0,0,0.0,0.0,0,0.0,1,0,5
43,122,22,16-Nov-61,1,18.0,1,2,3,9,2,...,0,0.0,0,0.0,0.0,0,0.0,0,0,5
74,277,57,14-Nov-61,1,30.0,1,6,8,7,6,...,0,4.804611,1,16.015369,11.210759,0,0.0,0,1,3
127,1372,228,23-Oct-61,1,27.0,1,4,7,10,4,...,0,0.0,0,0.0,0.0,1,0.800768,0,0,2
269,653,42,03-Oct-61,1,33.0,1,2,4,7,2,...,0,0.0,0,0.0,0.0,0,0.0,0,0,3


In [11]:
df_train["survey_date"] = pd.to_datetime(df_train["survey_date"],infer_datetime_format=True)
df_train['survey_month'] = df_train.survey_date.dt.month
df_train['survey_dayof_year'] = df_train.survey_date.dt.dayofyear
df_train["survey_dayof_week"] = df_train["survey_date"].dt.dayofweek #change the full date to day of week

In [12]:
df_train.columns

Index(['surveyid', 'village', 'survey_date', 'femaleres', 'age', 'married',
       'children', 'hhsize', 'edu', 'hh_children', 'hh_totalmembers',
       'cons_nondurable', 'asset_livestock', 'asset_durable', 'asset_phone',
       'asset_savings', 'asset_land_owned_total', 'asset_niceroof',
       'cons_allfood', 'cons_ownfood', 'cons_alcohol', 'cons_tobacco',
       'cons_med_total', 'cons_med_children', 'cons_ed', 'cons_social',
       'cons_other', 'ent_wagelabor', 'ent_ownfarm', 'ent_business',
       'ent_nonagbusiness', 'ent_employees', 'ent_nonag_revenue',
       'ent_nonag_flowcost', 'ent_farmrevenue', 'ent_farmexpenses',
       'ent_animalstockrev', 'ent_total_cost', 'fs_adskipm_often',
       'fs_adwholed_often', 'fs_chskipm_often', 'fs_chwholed_often', 'fs_meat',
       'fs_enoughtom', 'fs_sleephun', 'med_expenses_hh_ep',
       'med_expenses_sp_ep', 'med_expenses_child_ep',
       'med_portion_sickinjured', 'med_port_sick_child', 'med_afford_port',
       'med_sickdays_hhave

In [13]:
# pairwise correlation
df_train.corr()["depressed"]

surveyid                  -0.150998
village                    0.062118
femaleres                 -0.674200
age                        0.130225
married                         NaN
children                   0.536330
hhsize                     0.350000
edu                       -0.135582
hh_children                0.536330
hh_totalmembers            0.350000
cons_nondurable            0.067717
asset_livestock            0.667591
asset_durable              0.560066
asset_phone                0.396973
asset_savings              0.302434
asset_land_owned_total     0.083598
asset_niceroof                  NaN
cons_allfood               0.063593
cons_ownfood              -0.003055
cons_alcohol               0.636751
cons_tobacco              -0.134840
cons_med_total             0.040640
cons_med_children          0.173419
cons_ed                   -0.114307
cons_social               -0.294886
cons_other                -0.086669
ent_wagelabor             -0.258199
ent_ownfarm               -0

In [14]:
# df_train.isna().any() #check for which column has missing values

In [15]:
df_train["med_u5_deaths"].fillna(0,inplace=True)
df_train["med_expenses_sp_ep"].fillna(0,inplace=True)
df_train["edu"].fillna(df_train["edu"].mean(), inplace=True)
df_train["age"] = df_train["age"].apply(pd.to_numeric)

df_train["age"].fillna(df_train["age"].mean(), inplace=True)

In [16]:
y_train = df_train.depressed
y_train.head(5)

22     0
43     0
74     1
127    0
269    0
Name: depressed, dtype: int64

In [17]:
#choosing initial features basing on strength of corellation with target feature
X_train = df_train[['edu','age']]

In [18]:
# bins = [0, 5, 13, 20, 35, 50, 75, 100]
# X_train['age'] = pd.cut(X_train['age'], bins)

In [19]:
# from sklearn.model_selection import GridSearchCV

# param_grid = [
# {'n_estimators':[100,1000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,8]},
# ]
# # {'n_estimators':[500,1000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8]},
# # {'bootstrap':[False], 'n_estimators':[500,1000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8]}

# forest_reg = RandomForestClassifier()

# grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

# grid_search.fit(X_train,y_train)

In [20]:
# grid_search.best_params_

In [21]:
# grid_search.best_estimator_

In [22]:
# model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#             max_depth=4, max_features='auto', max_leaf_nodes=None,
#             min_impurity_decrease=0.0, min_impurity_split=None,
#             min_samples_leaf=1, min_samples_split=2,
#             min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
#             oob_score=False, random_state=None, verbose=0,
#             warm_start=False)

In [23]:
X_train.head()

Unnamed: 0,edu,age
22,10,34.0
43,9,18.0
74,7,30.0
127,10,27.0
269,7,33.0


In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
# model = KNeighborsClassifier(n_neighbors=3)
# model = BaggingClassifier()

In [25]:
model = RandomForestClassifier()

In [26]:
model.fit(X_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [27]:
prediction_train_set = model.predict(X_train)

In [28]:
# print (mean_absolute_error(y_train,prediction_train_set)) #best is 0.0

In [29]:
# print(explained_variance_score(y_train,prediction_train_set)) #best is 1.0

In [30]:
df_test = pd.read_csv('test.csv.xlt', low_memory=False)

In [31]:
df_test.head(5)

Unnamed: 0,surveyid,village,survey_date,femaleres,age,married,children,hhsize,edu,hh_children,...,given_mpesa,amount_given_mpesa,received_mpesa,amount_received_mpesa,net_mpesa,saved_mpesa,amount_saved_mpesa,early_survey,depressed,day_of_week
0,901,181,14-Nov-61,1,21,0,3,4,10,3,...,0,0.0,0,0.0,0.0,1,0.0,0,0.0,3
1,498,47,18-Sep-61,1,44,1,6,8,6,6,...,0,0.0,0,0.0,0.0,0,0.0,0,0.0,2
2,710,8,03-Sep-61,1,23,1,1,3,7,1,...,0,0.0,0,0.0,0.0,1,0.080077,0,0.0,1
3,433,25,01-Nov-60,1,67,0,0,1,1,0,...,0,0.0,0,0.0,0.0,0,0.0,1,1.0,3
4,44,2,18-May-60,1,40,1,4,6,7,4,...,0,0.0,0,0.0,0.0,0,0.0,1,,4


In [32]:
df_test[["age"]].head(3)

Unnamed: 0,age
0,21
1,44
2,23


In [33]:
df_test[['age']] = df_test[['age']].replace('.d', '30')

In [34]:
df_test["age"].head(3)

0    21
1    44
2    23
Name: age, dtype: object

In [35]:
df_test["med_u5_deaths"].fillna(0,inplace=True)
df_test["med_expenses_sp_ep"].fillna(0,inplace=True)
df_test["edu"].fillna(df_test["edu"].mean(), inplace=True)
df_test[["age"]] = df_test[["age"]].apply(pd.to_numeric)
df_test["age"].fillna(df_test["age"].mean(), inplace=True)

In [36]:
# df_test["survey_date"] = pd.to_datetime(df_train["survey_date"],infer_datetime_format=True)
# df_test['survey_month'] = df_test.survey_date.dt.month
# df_test['survey_dayof_year'] = df_test.survey_date.dt.dayofyear
# df_test["survey_dayof_week"] = df_test["survey_date"].dt.dayofweek #change the full date to day of week

In [37]:
X_test = df_test[['edu','age']]

In [38]:
print(X_test.isnull().sum())

edu    0
age    0
dtype: int64


In [39]:
test_predictions = model.predict(X_test)

In [40]:
df_submit = pd.read_csv('sample_submission.csv-2.xlt', low_memory=False)

In [41]:
d = {'surveyid': df_test["surveyid"], 'depressed': test_predictions}
df_predictions = pd.DataFrame(data=d)
df_predictions = df_predictions[['surveyid','depressed']]

In [42]:
df_predictions.head()

Unnamed: 0,surveyid,depressed
0,901,0
1,498,0
2,710,0
3,433,0
4,44,1


In [43]:
# # histograms
# df_predictions.hist(figsize=(15,20))
# plt.figure()

In [44]:
df_predictions.head(5)

Unnamed: 0,surveyid,depressed
0,901,0
1,498,0
2,710,0
3,433,0
4,44,1


In [45]:
#df_predictions["depressed"].value_count()

In [46]:
df_predictions.to_csv('depressed_prediction.csv', index=False) #save to csv file