In [1]:
import pandas as pd
import numpy as np

import sklearn

from sklearn.naive_bayes  import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn import svm
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier
                        
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import f1_score, accuracy_score


from feature_builder import process_dataset, add_text_embeddings, calculate_keyword_encoding
from hyperparameter_tuning import random_search

In [2]:
x_train = pd.read_csv('train.csv')
y_train=x_train[['id','target']]
test=pd.read_csv('test.csv')
#aw=pd.read_csv('aw.csv')

In [3]:

x_processed = process_dataset(x_train, use_spacy=True)

Percentage of words covered in the embeddings = 0.4937444933920705
Percentage of words covered in the embeddings = 0.5961027457927369


In [4]:
logisticRegr = LogisticRegression(solver='liblinear', penalty='l1', multi_class='auto', max_iter=1000, C=1)

In [5]:
SVC = svm.SVC(degree=10,coef0=10,C=5, probability=True)

In [6]:
catboost = CatBoostClassifier(verbose=False)

In [7]:
xgbooster = XGBClassifier(max_depth=3, n_estimators=600, colsample_bytree=0.9,
                        subsample=0.9, nthread=4, learning_rate=0.05)
BC_XGB = BaggingClassifier(base_estimator= xgbooster, n_estimators=10, random_state=0)

In [8]:
gbm = LGBMClassifier()
BC_LGB = BaggingClassifier(base_estimator= gbm, n_estimators=10, random_state=0)

In [9]:
VC_LSC = VotingClassifier(estimators=[('lr', logisticRegr), ('svc', SVC), ('catboost', catboost),('bc_xgb',BC_XGB),('bc_lgb',BC_LGB)], voting='soft')

In [10]:
VC_LSC.fit(x_processed, y_train['target'])

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1, max_iter=1000,
                                                 penalty='l1',
                                                 solver='liblinear')),
                             ('svc',
                              SVC(C=5, coef0=10, degree=10, probability=True)),
                             ('catboost',
                              <catboost.core.CatBoostClassifier object at 0x000002917A5331F0>),
                             ('bc_xgb',
                              BaggingClassifier(base_estimator=XGBClassifier(base_score=None,
                                                                             booster=None,
                                                                             colsample_bylevel=None,
                                                                             colsampl...
                                                                             missing=nan,
 

In [11]:
test_features=process_dataset(test,use_spacy=True)

Percentage of words covered in the embeddings = 0.5707598689343111
Percentage of words covered in the embeddings = 0.665389037945573


In [12]:
predictions_features = VC_LSC.predict(test_features)

In [7]:
#f1_score(aw['target'], predictions_features)

In [13]:
print(predictions_features)

[1 1 1 ... 1 1 1]


In [14]:
ids = test['id']
final_df = pd.DataFrame({'target': [x for x in predictions_features]}, index=ids)
final_df.head(10)

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,1
9,1
11,1
12,0
21,0
22,0
27,0
29,0


In [15]:
final_df.to_csv('big_ensemble.csv')

In [16]:
final_df['target'].value_counts()

0    2074
1    1189
Name: target, dtype: int64

In [12]:
final_df.count()

target    3263
dtype: int64