In [2]:
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
import numpy as np
import xgboost as xgb
import gc

from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2

from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
import statsmodels.api as sm
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


In [3]:
def reduce_mem_usage(df):
    """ 
    iterate through all the columns of a dataframe and 
    modify the data type to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage of dataframe is {:.2f}' 
                     'MB').format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max <\
                  np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max <\
                   np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max <\
                   np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max <\
                   np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max <\
                   np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max <\
                   np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage after optimization is: {:.2f}' 
                              'MB').format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) 
                                             / start_mem))
    
    return df

In [37]:
df_train = pd.read_csv('onetwotrip_challenge_train.csv')
#     df[geo].fillna(0, inplace=True)
features = list(filter(lambda x: 'field' in x, df_train.columns))
X_train = df_train[features]  #independent columns

xgb_clf = xgb.XGBClassifier(objective="binary:logistic",
                                    nthread=4, 
                                    seed=42,
                                    gamma=0.01,
                                    learning_rate=0.01,
                                    max_depth=7,
                                    n_estimators=100,
                                    reg_lambda=0.01)
xgb_clf.fit(df_train[features], df_train['goal1'])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0.01,
              learning_rate=0.01, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=4, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=0.01, scale_pos_weight=1, seed=42,
              silent=None, subsample=1, verbosity=1)

In [35]:
# тут по частям считываем dataset
# для первой части датасета можно вне цикла это сделать
name_model = None
# xgb_estimator = xgb_clf
xgb_params = {
  'update':'refresh',
  'process_type': 'update',
  'refresh_leaf': True,
  'silent': True,
  }

chunks = 3
for i in range(1, chunks):
    df_train = pd.read_csv('onetwotrip_challenge_train.csv')
#     df[geo].fillna(0, inplace=True)
    features = list(filter(lambda x: 'field' in x, df_train.columns))
    X_train = df_train[features]  #independent columns
    y_train = df_train['goal1']   #target column i.e price range
    
#     xgb_estimator = xgb.train(xgb_params, 
#                             dtrain=xgb.DMatrix(X_train, y_train),
# #                             evals=(xgb.DMatrix(xvalid, yvalid),"Valid"),
#                             # Pass partially trained model:
#                             xgb_model = xgb_estimator)
#     xgb_clf = xgb.XGBClassifier(objective="binary:logistic",
#                                     nthread=4, 
#                                     seed=42,
#                                     gamma=0.01,
#                                     learning_rate=0.01,
#                                     max_depth=7,
#                                     n_estimators=100,
#                                     reg_lambda=0.01)
    
    xgb_model = xgb.XGBClassifier(eval_metric='auc')
    xgb_model.fit(X_train,
                  y_train,
                  xgb_model = name_model)
    name_model = 'xgb_model'
    xgb_model.save_model(name_model)
    y_pred = xgb_model.predict_proba(X_train)
    print('Train score:{}'.format(roc_auc_score(y_train, y_pred[:,1])))
    
    del df_train, X_train, y_train
    gc.collect()

None
Train score:0.7069575754289972
None
Train score:0.7249292148394441


In [38]:
reduce_mem_usage(df_train)

Memory usage of dataframe is 64.32MB
Memory usage after optimization is: 17.09MB
Decreased by 73.4%


Unnamed: 0,orderid,userid,field0,field1,field2,field3,field4,field5,field6,field7,...,indicator_goal22,indicator_goal23,indicator_goal24,indicator_goal25,goal21,goal22,goal23,goal24,goal25,goal1
0,0,10d654494cbe97bbb25d51ead2600679aff9e097924add...,0,-0.626465,11,12,1,1,0,1,...,1,0,1,1,0,1,0,0,0,0
1,1,4aafc0391f72bbcf60537aece62923baf9ce644b64ac36...,144,-0.393799,5,7,2,0,0,2,...,1,0,1,0,0,0,0,0,0,0
2,2,bac8ffef46348f587c8d17137ab01fb24aef21547c647d...,134,-0.548828,2,3,2,0,0,1,...,1,0,1,1,0,0,0,0,0,0
3,3,0392247b4b87674aba2c32bf2292b105771a6a376871be...,0,-0.238647,10,11,1,1,3,2,...,1,0,1,1,0,0,0,0,0,0
4,4,d1aeefef311bbeb4bd84876c8d49421f276674527d5578...,0,-0.704102,8,11,1,1,0,1,...,1,0,0,1,0,0,0,0,0,0
5,5,bd2f85e3ed0cfa6ce641f632e4cfa35e170336ec5408cb...,0,-0.316162,9,9,1,1,0,1,...,1,0,0,0,0,0,0,0,0,0
6,6,6c6610f18ccd71f5ce664f9a840883d5d4b790d3adb88f...,0,0.071655,10,10,1,1,0,1,...,1,1,1,1,0,1,0,0,0,0
7,7,daa23cdeee4840e522def77d76c05e4c9a1b64980d0aaa...,0,-0.238647,10,11,1,1,1,1,...,1,1,1,0,0,1,1,0,0,0
8,8,c58f19409e035aafe7fd70b684ccd917402912c10b8a81...,0,-0.005936,8,8,1,1,0,1,...,1,1,0,1,0,1,0,0,0,0
9,9,0f5ac06456131b14993ca38465ea8f1c8867d55c53cb32...,156,1.623047,2,2,3,0,6,2,...,1,0,0,0,0,0,0,0,0,0
