In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.utils import class_weight
from scipy.stats import uniform as sp_uniform

In [2]:
n_folds = 5

In [3]:
train_data = pd.read_csv("../input/train.csv")
test_data = pd.read_csv("../input/test.csv")

In [4]:
target = train_data.pop('Target')
test_id = test_data.pop('Id')

In [5]:
def remove_str(df):
    df = df.applymap(lambda x: 0 if x == 'no' else (1 if x == 'yes' else x))

In [6]:
def remove_object_from_dependency(df):
    df['dependency'] = np.sqrt(df['SQBdependency'])

In [7]:
def calculate_edu(row):
    if (row['edjefe'] == 'yes' and row['edjefa'] == 'no') or (row['edjefe'] == 'no' and row['edjefa'] == 'yes'):
        return 1
    if row['edjefe'] == 'no' and row['edjefa'] == 'no':
        return 0
    if row['edjefe'] == 'yes' or row['edjefe'] == 'no':
        return pd.to_numeric(row['edjefa'])
    return pd.to_numeric(row['edjefe'])

def clean_edu(df):
    df['house_holder_edu'] = df.apply(calculate_edu, axis=1).values

In [8]:
def calculate_mean_edu(group):
    avg = group[group['age'] >= 18]['escolari'].mean()
    group['meaneduc_'] = avg if avg else 0.0
    return group

def clean_mean_edu(df):
    df['meaneduc'] = df[['idhogar', 'age', 'escolari']].groupby('idhogar').apply(calculate_mean_edu)['meaneduc_']
    df['SQBmeaned'] = np.square(df['meaneduc'])

In [9]:
def f1_cv(model, train, label):
    kf = KFold(n_folds, shuffle=True, random_state=42)
    return cross_val_score(model, train, label, scoring=make_scorer(f1_score, average = "macro"), cv = kf)

In [10]:
def add_more_feature(df):
    df['Rent_per_individual'] = df['v2a1'] / df['r4t3']
    #df['Rent_per_child'] = df['v2a1'] / df['r4t1']
    df['Rent_per_over_65'] = df['v2a1'] / df['r4t3']
    df['Rent_per_room'] = df['v2a1'] / df['rooms']
    df['Rent_per_bedrooms'] = df['v2a1'] / df['bedrooms']
    df['Proportion_under_12'] = df['r4t1'] / df['r4t3']
    df['Proportion_under_12_male'] = df['r4h1'] / df['r4t3']
    df['Proportion_under_12_female'] = df['r4m1'] / df['r4t3']
    df['Proportion_male'] = df['r4h3'] / df['r4t3']
    df['Proportion_female'] = df['r4m3'] / df['r4t3']
    df['Rooms_per_individual'] = df['rooms'] / df['r4t3']
    #df['Rooms_per_child'] = df['rooms'] / df['r4t1']
    df['Tablets_per_individual'] = df['v18q1'] / df['r4t3']
    #df['Tablets_per_child'] = df['v18q1'] / df['r4t1']
    df['Years_schooling_per_individual'] = df['escolari'] / df['r4t3']
    df['Years_schooling_per_adult'] = df['escolari'] / (df['r4t3'] - df['r4t1'])
    #df['Years_schooling_per_child'] = df['escolari'] / df['r4t1']
    df['Proportion_under_19'] = df['hogar_nin'] / df['r4t3']
    df['Proportion_over_19'] = df['hogar_adul'] / df['r4t3']
    df['Proportion_under_65'] = (df['hogar_total'] - df['hogar_mayor']) / df['r4t3']
    df['Proportion_over_65'] = df['hogar_mayor'] / df['r4t3']
    df['Bedrooms_per_individual'] = df['bedrooms'] / df['r4t3']
    #df['Bedrooms_per_child'] = df['bedrooms'] / df['r4t1']
    df['Bedrooms_per_over_65'] = df['bedrooms'] / df['r4t3']
    df['Extreme_conditions_flag'] = (df['abastaguano'] & df['noelec'] & df['sanitario1'] & df['energcocinar1'])
    df['bedrooms_to_rooms'] = df['bedrooms'] / df['rooms']
    df['tamhog_to_rooms'] = df['tamhog'] / df['rooms']
    df['tamhog_to_bedrooms'] = df['tamhog'] / df['bedrooms']
    df['r4t3_to_tamhog'] = df['r4t3'] / df['tamhog']
    df['hhsize_to_rooms'] = df['hhsize'] / df['rooms']
    df['hhsize_to_bedrooms'] = df['hhsize'] / df['bedrooms']
    df['rent_to_hhsize'] = df['v2a1'] / df['hhsize']
    df['qmobilephone_to_r4t3'] = df['qmobilephone'] / df['r4t3']
    #df['qmobilephone_to_v18q1'] = df['qmobilephone'] / df['v18q1']

In [11]:
remove_str(train_data)
remove_str(test_data)

In [12]:
remove_object_from_dependency(train_data)
remove_object_from_dependency(test_data)

In [13]:
clean_edu(train_data)
clean_edu(test_data)

In [14]:
clean_mean_edu(train_data)
clean_mean_edu(test_data)

In [15]:
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

In [16]:
add_more_feature(train_data)
add_more_feature(test_data)

In [17]:
train_data.drop(['idhogar', 'Id', 'edjefe', 'edjefa'], axis=1, inplace=True)
test_data.drop(['idhogar', 'edjefe', 'edjefa'], axis=1, inplace=True)

In [18]:
train_data.shape

(9557, 167)

In [19]:
test_data.shape

(23856, 167)

In [20]:
x1, x2, y1, y2 = model_selection.train_test_split(train_data, target, test_size=0.20, random_state=5)
model = RandomForestClassifier(n_jobs=-1, random_state=7)
model.fit(x1, y1)
print(f1_score(y2, model.predict(x2), average='macro'))

0.856977589514


In [21]:
col = pd.DataFrame({'importance': model.feature_importances_,
                    'feature': train_data.columns}).sort_values(by=['importance'], ascending=[False])['feature'].values

In [22]:
col

array(['meaneduc', 'SQBmeaned', 'house_holder_edu', 'SQBdependency',
       'qmobilephone', 'SQBovercrowding', 'SQBhogar_nin', 'overcrowding',
       'dependency', 'SQBedjefe', 'rooms', 'age', 'agesq', 'r4t1',
       'SQBage', 'r4h3', 'r4t2', 'cielorazo', 'r4h2', 'hogar_nin', 'v2a1',
       'r4m3', 'hogar_adul', 'hhsize', 'bedrooms', 'eviv3', 'tamviv',
       'r4m1', 'escolari', 'r4m2', 'SQBescolari', 'v18q1', 'epared3',
       'r4t3', 'tamhog', 'r4h1', 'hogar_total', 'paredblolad', 'lugar1',
       'energcocinar2', 'SQBhogar_total', 'energcocinar3', 'paredpreb',
       'epared2', 'area1', 'area2', 'pisomoscer', 'etecho3', 'paredmad',
       'hogar_mayor', 'lugar4', 'tipovivi1', 'tipovivi3', 'pisocemento',
       'television', 'etecho1', 'etecho2', 'sanitario3', 'eviv1',
       'tipovivi5', 'epared1', 'eviv2', 'lugar5', 'v18q', 'sanitario2',
       'energcocinar4', 'elimbasu3', 'lugar2', 'pisomadera', 'coopele',
       'public', 'lugar3', 'elimbasu1', 'paredzocalo', 'tipovivi2',
      

In [20]:
train_p = train_data[train_data['parentesco1'] == 1]
target_p = target.loc[train_data['parentesco1'] == 1]

In [21]:
class_weights = class_weight.compute_class_weight('balanced', np.unique(target), target)
class_weights = dict(zip(np.unique(target.values - 1), class_weights))

In [66]:
model_lgb = lgb.LGBMClassifier(objective='multiclass',
                               num_leaves=144,
                               num_class=len(np.unique(target)),
                               learning_rate=0.001,
                               n_estimators=600,
                               max_depth=16,
                               feature_fraction=0.85,
                               bagging_fraction=0.85,
                               bagging_freq=5,
                               min_child_samples=10,
                               subsample=0.85,
                               num_threads=6,
                               class_weight=class_weights)

In [67]:
score_p = f1_cv(model_lgb, train_p, target_p - 1)
print("LGBM model score for house holder: {:.4f} ({:.4f})\n".format(score_p.mean(), score_p.std()))

#score = f1_cv(model_lgb, train_data, target - 1)
#print("LGBM model score for all: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

LGBM model score for house holder: 0.4157 (0.0206)



In [None]:
def evaluate_macroF1_lgb(truth, predictions):
    pred_labels = predictions.reshape(len(np.unique(truth)),-1).argmax(axis=0)
    f1 = f1_score(truth, pred_labels, average='macro')
    return ('macroF1', f1, True) 

In [24]:
model_lgb.fit(train_data,
              target - 1,
              eval_metric=evaluate_macroF1_lgb)

LGBMClassifier(bagging_fraction=0.85, bagging_freq=5, boosting_type='gbdt',
        class_weight={0: 3.1645695364238411, 1: 1.4960864120225423, 2: 1.9762200165425972, 3: 0.39847398265510342},
        colsample_bytree=1.0, feature_fraction=0.85, learning_rate=0.001,
        max_depth=20, metric='multi_logloss', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=800,
        n_jobs=-1, num_class=4, num_leaves=144, num_threads=6,
        objective='multiclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)

In [24]:
pred = model_lgb.predict(test_data)

In [25]:
submit = pd.DataFrame()
submit['Id'] = test_id

In [26]:
submit['Target'] = pred + 1
submit.to_csv('sample_prediction.csv', index=False)