In [2]:
#https://www.kaggle.com/c/costa-rican-household-poverty-prediction/data

# <font color="#2086BA">1. Importing packages and Data</font>

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gc

%matplotlib inline

In [4]:
folderPath = "D:/Rep/MyLearning/competitions/kaggle/CostoRiconHVP";

In [5]:
fileName = "train.csv"
df_train = pd.read_csv(os.path.join(folderPath,fileName))

In [6]:
fileName = "test.csv"
df_test = pd.read_csv(os.path.join(folderPath,fileName))

In [7]:
data = pd.concat([df_train,df_test])

In [8]:
data.head()

Unnamed: 0,Id,SQBage,SQBdependency,SQBedjefe,SQBescolari,SQBhogar_nin,SQBhogar_total,SQBmeaned,SQBovercrowding,Target,...,television,tipovivi1,tipovivi2,tipovivi3,tipovivi4,tipovivi5,v14a,v18q,v18q1,v2a1
0,ID_279628684,1849,0.0,100,100,0,1,100.0,1.0,4.0,...,0,0,0,1,0,0,1,0,,190000.0
1,ID_f29eb3ddd,4489,64.0,144,144,0,1,144.0,1.0,4.0,...,0,0,0,1,0,0,1,1,1.0,135000.0
2,ID_68de51c94,8464,64.0,0,121,0,1,121.0,0.25,4.0,...,0,1,0,0,0,0,1,0,,
3,ID_d671db89c,289,1.0,121,81,4,16,121.0,1.777778,4.0,...,0,0,0,1,0,0,1,1,1.0,180000.0
4,ID_d56d6f5f5,1369,1.0,121,121,4,16,121.0,1.777778,4.0,...,0,0,0,1,0,0,1,1,1.0,180000.0


# <font color="#2086BA">2. Data Cleaning and Transformation</font> 

In [10]:
missing_df = data.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df = missing_df[missing_df['missing_count']>0]
missing_df = missing_df.sort_values(by='missing_count')
missing_df

Unnamed: 0,column_name,missing_count
7,SQBmeaned,36
77,meaneduc,36
9,Target,23856
142,v2a1,24263
141,v18q1,25468
120,rez_esc,27581


In [17]:
print('Shape of train data :',df_train.shape)
print('Shape of train data :',df_test.shape)

Shape of train data : (9557, 143)
Shape of train data : (23856, 142)


In [20]:
print('Number of household in train data:',df_train['idhogar'].nunique())
print('Number of household in test data:',df_test['idhogar'].nunique())

Number of household in train data: 2988
Number of household in test data: 7352


In [23]:
print('Number of head of families in train data :',df_train['parentesco1'].sum())
print('Number of head of families in test data:',df_test['parentesco1'].sum())

Number of head of families in train data : 2973
Number of head of families in test data: 7334


In [25]:
familyheads = data.groupby('idhogar').agg({'parentesco1':'sum'}).reset_index()
familyheads.columns = ['idhogar','numberofheads']

In [29]:
print('The number of families with no heads:',len(familyheads[familyheads['numberofheads']==0]))

The number of families with no heads: 33


In [48]:
data['meaneduc'].fillna(0,inplace=True)
data['SQBmeaned'].fillna(0,inplace=True)
data['v18q1'].fillna(0,inplace=True)
data['rez_esc'].fillna(0,inplace=True)
data['v2a1'].fillna(0,inplace=True)

In [34]:
datatypes = data.dtypes.reset_index()
datatypes.columns = ["Count", "Column Type"]
datatypes[datatypes['Column Type']=='object']

Unnamed: 0,Count,Column Type
0,Id,object
21,dependency,object
23,edjefa,object
24,edjefe,object
60,idhogar,object


In [36]:
data['edjefe']

0        10
1        12
2        no
3        11
4        11
5        11
6        11
7         9
8         9
9         9
10        9
11       no
12       no
13       no
14       no
15       no
16       no
17       no
18       no
19       15
20       15
21        4
22        4
23        6
24        6
25        6
26       15
27        6
28        6
29        6
         ..
23826    no
23827    no
23828    no
23829    no
23830     9
23831     9
23832     6
23833     6
23834     6
23835    no
23836    no
23837     4
23838     4
23839     4
23840     4
23841     4
23842     4
23843    no
23844    no
23845    no
23846     5
23847     5
23848     5
23849     5
23850     5
23851     5
23852     6
23853     6
23854     6
23855     6
Name: edjefe, Length: 33413, dtype: object

# <font color="#2086BA">3. Data Exploration</font> 

# <font color="#2086BA">4. Feature Engineering</font> 

# <font color="#2086BA">5. Predictive Modelling</font> 

In [65]:
prediction_var = [x for x in df_train.columns if x not in ['Id','dependency','edjefa','edjefe','idhogar','parentesco1','Target']]
target_var = 'Target'

In [71]:
train = data[(data[target_var].notnull())]
test = data[(data[target_var].isnull())]
test.drop(['Target'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [72]:
from sklearn.cross_validation import KFold   #For K-fold cross validation
from sklearn import metrics
#Generic function for making a classification model and accessing performance:
def classification_model(model, data, predictors, outcome):
    #Fit the model:
    model.fit(data[predictors],data[outcome])
    #Make predictions on training set:
    predictions = model.predict(data[predictors])
    #Print accuracy
    accuracy = metrics.accuracy_score(predictions,data[outcome])
    print("Accuracy : %s" % "{0:.3%}".format(accuracy))

    #Perform k-fold cross-validation with 5 folds
    kf = KFold(data.shape[0], n_folds=5)
    error = []
    for train, test in kf:
        # Filter training data
        train_predictors = (data[predictors].iloc[train,:])
    
        # The target we're using to train the algorithm.
        train_target = data[outcome].iloc[train]
    
        # Training the algorithm using the predictors and target.
        model.fit(train_predictors, train_target)
    
        #Record error from each cross-validation run
        error.append(model.score(data[predictors].iloc[test,:], data[outcome].iloc[test]))
 
        print("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(error)))

        #Fit the model again so that it can be refered outside the function:
        model.fit(data[predictors],data[outcome]) 

In [73]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
classification_model(model,train,prediction_var,target_var)

Accuracy : 65.732%
Cross-Validation Score : 76.778%
Cross-Validation Score : 74.242%
Cross-Validation Score : 71.315%
Cross-Validation Score : 67.641%
Cross-Validation Score : 65.312%


In [74]:
len(test)

23856

In [79]:
test["Target"] = model.predict(test[prediction_var])
test[["Id","Target"]].to_csv(os.path.join(folderPath,"CostaRiconPovertyPrediction_LR_12092018.csv"),index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


# <font color="#2086BA">6. Evaluation</font> 

In [82]:
test[["Id","Target"]].dtypes

Id         object
Target    float64
dtype: object