In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)
pd.options.display.float_format = '{:.2f}'.format

### Read data

In [112]:
PATH = 'titanic/train.csv'

In [171]:
from os.path import splitext

def read_file(path, limit_n_rows=None):
    filename, ext = splitext(path)
    if ext == '.csv':
        return pd.read_csv(path, nrows=limit_n_rows)
    elif ext in ('.xls', '.xlsx'):
        return pd.read_excel(path, nrows=limit_n_rows)
    else:
        return False    

In [172]:
dataset = read_file(PATH)

In [173]:
dataset.shape

(891, 12)

In [174]:
dataset.head()[c for c, col in enumerate(train.columns)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [181]:
[c for c, col in enumerate(dataset.columns)]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

### Determine problem

In [120]:
TARGET_COLUMN = 'Survived'
ID_COLUMN = 'PassengerId'

In [121]:
from collections import Counter
from enum import Enum

UNKNOWN_SONYNOMS = ["unknown", "n/a", "NaN", ""]

class ProblemTypes(Enum):
    BINARY_CLASSIFICATION = 1
    MULTI_LABEL_CLASSIFICATION = 2
    REGRESSION = 3
    CLASSIFICATION = 4
    UNKNOWN = 5


def handle_nans(iterable): return [np.nan if str(i).lower().strip() in UNKNOWN_SONYNOMS else i for i in iterable]                
def contains_string(iterable): return any([type(key) == 'str' for key, count in iterable])
def n_or_more_similar(iterable): return all([count > 1 for key, count in iterable])
def only_2_classes(iterable): return len([i for i in iterable if i is not np.nan])
def dtype_share(iterable, dtype, threshold=0.9): 
    success_count = 0
    for i in iterable:
        try:
            dtype(i)
        except ValueError:
            pass
        else:
            success_count += 1
    return success_count / len(iterable)

def guess_problem_type(df, target_column):
    labels = df[target_column].copy()
    labels = handle_nans(labels)
    c = Counter(labels)    
    top5 = c.most_common()[:5]

    if contains_string(top5) and n_or_more_similar(top5):
        if only_2_classes(top5):
            return ProblemTypes.BINARY_CLASSIFICATION
        else:
            return ProblemTypes.MULTI_LABEL_CLASSIFICATION
    elif dtype_share(top5.keys(), float): 
        return ProblemTypes.REGRESSION
    else:
        return ProblemTypes.UNKNOWN
    

In [None]:
problem_type = guess_problem_type(dataset, TARGET_COLUMN)

In [98]:
problem_type

<ProblemTypes.REGRESSION: 3>

### Analyse label and ID

In [99]:
sample_df = pd.DataFrame({
    "label": [5, 10, 4.3, 30, 40, "a", 3, 3, 3, "b", np.nan],
    "id": np.arange(11)
})

In [103]:
len(sample_df.label.unique())

9

In [100]:
LABEL_ASSESSMENT = {
    "nan_share": 0,
    "CLASSIFICATION": {
        "number_of_categories": 0        
    },
    "REGRESSION": {
        "not_a_number_share": 0                
    }  
}

ID_ASSESSMENT = {
    "nan_share": 0    
}


In [101]:
def evaluate_label_and_id(label_column, id_column, df, 
                   problem_type, label_assessment, id_assessment):
    """ 
    Assess if there are any issues with the labelled data column
    """
    for col in [label_column, id_column]:
        df[col] = handle_nans(df[col])
    label_assessment["nan_share"] = df[label_column].isnull().sum() / len(df[label_column])
    id_assessment["nan_share"] = df[id_column].isnull().sum() / len(df[id_column])
    if problem_type == ProblemTypes.CLASSIFICATION:
        label_assessment[problem_type.name]["number_of_categories"] = len(df[label_column].dropna().unique())
    if problem_type == ProblemTypes.REGRESSION:
        label_assessment[problem_type.name]["not_a_number_share"] = dtype_share(df[label_column], float)
    return {
        "label": label_assessment,
        "id": id_assessment
    }

assessmenet = evaluate_label("label", "id", sample_df, ProblemTypes.REGRESSION, LABEL_ASSESSMENT, ID_ASSESSMENT)

In [102]:
assessmenet

{'id': {'nan_share': 0.0},
 'label': {'CLASSIFICATION': {'number_of_categories': 0},
  'REGRESSION': {'not_a_number_share': 0.8181818181818182},
  'nan_share': 0.090909090909090912}}

### Preprocess features & labels

In [123]:
CATEGORICAL_DTYPES = ["object", "bool"]
def get_categorical_features(df, threshold=0.5):
    return [c for c in df if df[c].dtype in CATEGORICAL_DTYPES]

def find_rare_features(df1, df2, threshold=0.05):    
    df_cat = pd.concat([df1, df2])
    total_count = len(df_cat)
    feature_sums = df_cat.sum()
    return feature_sums[feature_sums / total_count < threshold].index.tolist()

def add_datepart(df, date_column):    
    df["Year_{}".format(date_column)] = df[date_column].dt.year
    df["Month_{}".format(date_column)] = df[date_column].dt.month
    df["Week_{}".format(date_column)] = df[date_column].dt.week
    df["Day_{}".format(date_column)] = df[date_column].dt.day
    df.drop([date_column], axis=1, inplace=True)

def convert_timestamp(df):
    df = df.copy()
    df = df.apply(lambda col: pd.to_datetime(col, errors='ignore') 
          if col.dtypes == object 
          else col, 
          axis=0)
    for col in df.select_dtypes(include=[np.datetime64]):
        print(col)
        add_datepart(df, col)
    return df

def preprocess_features(df):
    return convert_timestamp(df)

def preprocess_labels(ds, apply_log=False):
    if apply_log: ds = ds.apply(np.log)
    return ds

### Run model

In [187]:
from sklearn.metrics import auc
from sklearn.model_selection import cross_val_score, GridSearchCV
from copy import copy
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
import lightgbm as lgb
from sklearn.preprocessing import LabelBinarizer

def _do_grid_search(x, y, model, params, n_folds=5):
    gs = GridSearchCV(estimator=model, param_grid=params, verbose=5)
    gs.fit(x, y)
    print("Best", gs.best_params_, gs.best_score_, gs.grid_scores_)
    return gs.best_estimator_


def train_and_fit(df, parameters, target_column, 
                  id_column, test_size=0.2):
    
    # Test, train split
    df = df.copy()
    
    categorical_features = get_categorical_features(df)
    lb = LabelBinarizer()
    for c in categorical_features:        
        df[c] = lb.fit_transform(df[c].fillna(0)) 
    
    y = df[target_column].values
    df.drop([id_column, target_column], inplace=True, axis=1)
    x = df.values
    x, x_val, y, y_val = train_test_split(x, y, test_size=test_size, random_state=42)          
    
    # LightGBM data containers       
    train_data = lgb.Dataset(x, y)
    val_data = lgb.Dataset(x_val, y_val)
    clf_best = lgb.train(parameters,
                         train_data,
                         valid_sets=val_data,
                         num_boost_round=5000)

    return clf_best

In [195]:
dataset.median()


PassengerId   446.00
Survived        0.00
Pclass          3.00
Age            28.00
SibSp           0.00
Parch           0.00
Fare           14.45
dtype: float64

### Make models

In [178]:
RS=1
np.random.seed(RS)
ROUNDS = 1500 # 1300,1400 all works fine
regression_parameters = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
    'learning_rate': 0.01 , #small learn rate, large number of iterations
    'verbose': 0,
    'num_leaves': 2 ** 5,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': RS,
    'feature_fraction': 0.7,
    'feature_fraction_seed': RS,
    'max_bin': 100,
    'max_depth': 7,
    'num_rounds': ROUNDS,
}

classification_parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'auc',
    'boosting': 'gbdt',
    'learning_rate': 0.01 , #small learn rate, large number of iterations
    'verbose': 0,
    'num_leaves': 2 ** 5,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': RS,
    'feature_fraction': 0.7,
    'feature_fraction_seed': RS,
    'max_bin': 100,
    'max_depth': 7,
    'num_rounds': ROUNDS,
}

In [183]:
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

def select_model(problem_type):
    return {
        ProblemTypes.BINARY_CLASSIFICATION: GradientBoostingClassifier,
        ProblemTypes.MULTI_LABEL_CLASSIFICATION: GradientBoostingClassifier,
        ProblemTypes.REGRESSION: GradientBoostingRegressor      
    }[problem_type]

def create_model(df, test_df, target_column, id_column):
    problem_type = guess_problem_type(df, target_column)
    model = select_model(problem_type)()
    return train_and_fit(
        model=model,
        df=df,        
        kaggle_test_df=test_dataset,
        parameters=parameters,
        target_column=target_column,
        id_column=id_column,
        nvl_strategy="median",
        do_grid=False,
        apply_imbalance=False,
        apply_log=True,
        remove_outliers=True)

def create_classification_model(df, target_column, id_column, parameters):
    return train_and_fit(
        df=df,        
        parameters=parameters,
        target_column=target_column,
        id_column=id_column)

In [188]:
clf_best = create_classification_model(dataset,                                         
                                        TARGET_COLUMN, 
                                        ID_COLUMN,
                                        classification_parameters)

ValueError: Unknown label type: (0                0
1              C85
2                0
3             C123
4                0
5                0
6              E46
7                0
8                0
9                0
10              G6
11            C103
12               0
13               0
14               0
15               0
16               0
17               0
18               0
19               0
20               0
21             D56
22               0
23              A6
24               0
25               0
26               0
27     C23 C25 C27
28               0
29               0
30               0
31             B78
32               0
33               0
34               0
35               0
36               0
37               0
38               0
39               0
40               0
41               0
42               0
43               0
44               0
45               0
46               0
47               0
48               0
49               0
          ...     
841              0
842              0
843              0
844              0
845              0
846              0
847              0
848              0
849            C92
850              0
851              0
852              0
853            D28
854              0
855              0
856              0
857            E17
858              0
859              0
860              0
861              0
862            D17
863              0
864              0
865              0
866              0
867            A24
868              0
869              0
870              0
871            D35
872    B51 B53 B55
873              0
874              0
875              0
876              0
877              0
878              0
879            C50
880              0
881              0
882              0
883              0
884              0
885              0
886              0
887            B42
888              0
889           C148
890              0
Name: Cabin, Length: 891, dtype: object,)

In [153]:
clf_best.

[]

In [143]:
predict=clf_best.predict(test_X)
predict=np.exp(predict)
output=pd.DataFrame({'id':test_ids,'price_doc':predict})
output.to_csv('lgb_mik.csv',index=False)

NameError: name 'clf_best' is not defined

### Evaluate model performance

In [101]:
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    return np.sqrt(np.mean(np.power(np.log1p(y)-np.log1p(y_pred), 2)))

In [103]:
test_preds = clf_best.predict(test_X)
test_preds = np.exp(test_preds)
test_rmsle = rmsle(test_y, test_preds)

In [104]:
test_rmsle

0.45879505773826001

### Make predictions

In [33]:
test_dataset = pd.read_csv('/Users/mikkeld/ai-for-analysts/datasets/binary-classification/sberbank/test.csv')

In [41]:
def make_predictions(test_df):
    test_ids = test_df[ID_COLUMN]
    test_df = test_df.drop([ID_COLUMN], axis=1)
    test_df = preprocess_features(test_df)
    test_preds = clf_best.predict(test_df)
    test_preds = np.exp(test_preds)
    output =  pd.DataFrame({
        "id": test_ids,
        "price_doc": test_preds
    })    
    output.to_csv('lgb_mik.csv',index=False)

In [22]:
from sklearn.model_selection import train_test_split,KFold,TimeSeriesSplit
from sklearn import model_selection, preprocessing
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import model_selection, preprocessing
import pdb

def process(train,test):
    RS=1
    np.random.seed(RS)
    ROUNDS = 1500 # 1300,1400 all works fine
    params = {
        'objective': 'regression',
            'metric': 'rmse',
            'boosting': 'gbdt',
            'learning_rate': 0.01 , #small learn rate, large number of iterations
            'verbose': 0,
            'num_leaves': 2 ** 5,
            'bagging_fraction': 0.95,
            'bagging_freq': 1,
            'bagging_seed': RS,
            'feature_fraction': 0.7,
            'feature_fraction_seed': RS,
            'max_bin': 100,
            'max_depth': 7,
            'num_rounds': ROUNDS,
        }
    #Remove the bad prices as suggested by Radar
    train=train[(train.price_doc>1e6) & (train.price_doc!=2e6) & (train.price_doc!=3e6)]
    #train.loc[(train.product_type=='Investment') & (train.build_year<2000),'price_doc']*=0.9 
    #train.loc[train.product_type!='Investment','price_doc']*=0.969 #Louis/Andy's magic number
    test = pd.read_csv('/Users/mikkeld/ai-for-analysts/datasets/binary-classification/sberbank/test.csv',parse_dates=['timestamp'])

  
    id_test = test.id
    times=pd.concat([train.timestamp,test.timestamp])
    num_train=train.shape[0]
    y_train = train["price_doc"]
    train.drop(['price_doc'],inplace=True,axis=1)
    da=pd.concat([train,test])
    da['na_count']=da.isnull().sum(axis=1)
    df_cat=None
    to_remove=[]
    for c in da.columns:
        if da[c].dtype=='object':
            oh=pd.get_dummies(da[c],prefix=c)
            if df_cat is None:
                df_cat=oh
            else:
                df_cat=pd.concat([df_cat,oh],axis=1)
            to_remove.append(c)
    da.drop(to_remove,inplace=True,axis=1)

    #Remove rare features,prevent overfitting
    to_remove=[]
    if df_cat is not None:
        sums=df_cat.sum(axis=0)
        to_remove=sums[sums<200].index.values
        df_cat=df_cat.loc[:,df_cat.columns.difference(to_remove)]
        da = pd.concat([da, df_cat], axis=1)
    x_train=da[:num_train].drop(['timestamp','id'],axis=1)
    x_test=da[num_train:].drop(['timestamp','id'],axis=1)
    #Log transformation, boxcox works better.
    y_train=np.log(y_train)
    train_lgb=lgb.Dataset(x_train,y_train)
    model=lgb.train(params,train_lgb,num_boost_round=ROUNDS)
    predict=model.predict(x_test)
    predict=np.exp(predict)
    return predict,id_test

In [23]:
train = pd.read_csv('/Users/mikkeld/ai-for-analysts/datasets/binary-classification/sberbank/train.csv', parse_dates=['timestamp'])
test = pd.read_csv('/Users/mikkeld/ai-for-analysts/datasets/binary-classification/sberbank/test.csv', parse_dates=['timestamp'])
predict,id_test=process(train,test)
output=pd.DataFrame({'id':id_test,'price_doc':predict})
output.to_csv('lgb3.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
