In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,StandardScaler,LabelEncoder

In [59]:
# Data-location switch: 0 reads the CSVs from "readonly/", any other value
# reads them from the current working directory.
send=1
# Training tickets; this frame is kept untouched as train_df_raw.
train_df_raw=pd.read_csv("readonly/train.csv" if send==0 else "train.csv",encoding='cp1252')
# Addresses table (joined to the tickets on ticket_id below).
add_df=pd.read_csv("readonly/addresses.csv" if send==0 else "addresses.csv",encoding='cp1252')
# Latitude/longitude table (joined on address below).
latlon_df=pd.read_csv("readonly/latlons.csv" if send==0 else "latlons.csv",encoding='cp1252')
# Work on a copy of the raw frame and attach the address and coordinate
# columns; left joins so that no training row is dropped by a missing match.
train_df=(
    train_df_raw.copy()
    .merge(add_df, on='ticket_id', how='left')
    .merge(latlon_df, on='address', how='left')
)
# Build the training matrix and fit the classifier.
# Inputs: train_df (tickets merged with addresses and lat/lon).
# Outputs reused by later code: num_ix, cat_ix, X, y, scaler, Xcat_df, X_df, model.

# Columns used for modelling, plus the target.
features=["fine_amount","lat","lon","disposition","compliance"]
# Keep only those columns and drop rows with any missing value. The explicit
# copy() makes train_df an independent frame, so the column assignment below
# is not a write into a slice of the merged frame (chained assignment).
train_df=train_df[features].dropna().copy()
# force targets to int
train_df["compliance"]=train_df["compliance"].astype(int)
# list of num features
num_ix=["fine_amount","lat","lon"]
# list of cat features
cat_ix=["disposition"]
# split the dataframe into X (everything except the target) and y
X=train_df.loc[:, train_df.columns != 'compliance']
y=train_df.loc[:,'compliance']
# Min-max scale the numerical features. The fitted scaler is kept so the
# same training-derived scaling can be applied to the test set.
scaler=MinMaxScaler()
scaler.fit(X[num_ix])
Xnum=scaler.transform(X[num_ix])
Xnum_df=pd.DataFrame(Xnum,columns=num_ix)
# One-hot encode the categorical features. Xcat_df.columns is the reference
# column layout that the test-set dummies are aligned to.
Xcat_df = pd.get_dummies(X[cat_ix])
# Concatenate nums+cats positionally: Xnum_df has a fresh 0..n-1 index while
# Xcat_df still carries the post-dropna index, so both are reset first.
X_df = pd.concat([Xnum_df.reset_index(drop=True), Xcat_df.reset_index(drop=True)], axis=1)
# Fit the model. random_state is pinned so the fitted forest (and any score
# computed from it) is reproducible across kernel restarts.
model=RandomForestClassifier(n_estimators=200,max_depth=10,random_state=0)
model.fit(X_df,y)
# Score the test set with the fitted model.
# Inputs: send, add_df, latlon_df, num_ix, cat_ix, scaler, Xcat_df, model.
# Output: ans, a Series of predicted probabilities indexed by ticket_id.

# read test set (same location switch as the training data)
test_df_raw=pd.read_csv("readonly/test.csv" if send==0 else "test.csv",encoding='cp1252')
# Work on a copy, then attach addresses and coordinates; left joins so that
# no test ticket is dropped by a missing match.
test_df=test_df_raw.copy()
test_df=test_df.merge(add_df, on='ticket_id', how='left')
test_df=test_df.merge(latlon_df, on='address', how='left')
# Same features as training, minus the target. copy() makes this an
# independent frame so the fills below are plain column assignments.
features=["fine_amount","lat","lon","disposition"]
test_df=test_df[features].copy()
# Fill missing coordinates with the most frequent value of each column.
# The result is assigned back rather than using fillna(inplace=True) on a
# column selection, which warns and is a no-op under pandas copy-on-write.
test_df['lat']=test_df['lat'].fillna(test_df['lat'].mode()[0])
test_df['lon']=test_df['lon'].fillna(test_df['lon'].mode()[0])
# Scale numerical features with the scaler fitted on the TRAINING data.
# transform(), not fit_transform(): refitting here would rescale the test set
# by its own min/max, so the values would no longer match the scale the
# model's split thresholds were learned on.
X_test_num=scaler.transform(test_df[num_ix])
Xnum_df_test=pd.DataFrame(X_test_num,columns=num_ix)
# One-hot encode, then align to the training dummy columns: categories unseen
# in the test set become all-zero columns, categories unseen in training are dropped.
Xcat_df_test = pd.get_dummies(test_df[cat_ix])
Xcat_df_test=Xcat_df_test.reindex(columns=Xcat_df.columns, fill_value=0)
# concatenate nums+cats positionally
X_test_df = pd.concat([Xnum_df_test.reset_index(drop=True), Xcat_df_test.reset_index(drop=True)], axis=1)
# Column 1 of predict_proba is the second entry of model.classes_
# (compliance == 1, assuming both classes were present in training).
probs=model.predict_proba(X_test_df)[:,1]
# generate series keyed by ticket id
ans = pd.Series(probs, index=test_df_raw['ticket_id'])

  exec(code_obj, self.user_global_ns, self.user_ns)


In [60]:
# 5-fold cross-validated ROC AUC of the classifier on the prepared training
# matrix. cross_val_score (sklearn.model_selection) fits clones of `model`,
# so the already-fitted estimator is left unchanged.
# NOTE(review): X_df was scaled with a MinMaxScaler fitted on all training rows
# before the folds are split — confirm that is acceptable for this estimate.
scores=cross_val_score(model,X_df,y,cv=5,scoring="roc_auc")
# Mean AUC across the folds (displayed as the cell output).
scores.mean()

0.7756417948320647