# Submission
This notebook is essentially the same as the project notebook, except that the models are trained on the full training data and a submission CSV is generated.

In [3]:
import pandas as pd
import numpy as np
import re

In [2]:
# Base URL of the raw data files in the project repository.
DATA_URL = "https://raw.githubusercontent.com/DLaux/BSA2020_Team_Tissot_Project_2/master/data"


def load_split(name):
    """Download one data split (``"train"`` or ``"test"``) as a DataFrame.

    Every column is read as a string (``dtype='unicode'``), so no numeric or
    date inference takes place. The ``infer_datetime_format`` argument used
    previously was dropped: it is deprecated in ``read_csv`` since pandas 2.0
    and had no effect on string columns.

    Parameters
    ----------
    name : str
        File stem of the split; ``<name>.csv`` is fetched from ``DATA_URL``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV with all columns holding strings.
    """
    return pd.read_csv(
        DATA_URL + "/" + name + ".csv",
        encoding='utf_8',
        dtype='unicode',
        parse_dates=True,
        low_memory=False,
    )


df_train = load_split("train")
df_test = load_split("test")

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn import utils
from sklearn.model_selection import train_test_split

from preprocess_tweets import preprocess_tweet, remove_stopwords

In [5]:
def _clean_text(series):
    """Run the two project-local cleaning helpers over every tweet in ``series``.

    ``preprocess_tweet`` is applied first and ``remove_stopwords`` second; both
    come from the ``preprocess_tweets`` module, whose internals are not visible
    in this notebook.
    """
    return series.apply(preprocess_tweet).apply(remove_stopwords)


# Overwrite the raw text column of both splits with the cleaned version.
# NOTE: this cell is not idempotent - re-running it cleans already-cleaned text.
df_train["text"] = _clean_text(df_train["text"])
df_test["text"] = _clean_text(df_test["text"])

In [26]:
# Target labels and training features (everything except the label and the id).
y = df_train["target"]
X = df_train.drop(columns=["target", "id"])

# Test features: the test split has no target column, so only the id is dropped.
X_test = df_test.drop(columns=["id"])

In [28]:
def _as_str_columns(frame, columns=("keyword", "location", "text")):
    """Cast the given columns of ``frame`` to ``str``, writing back in place.

    Missing values become the literal string ``"nan"`` after the cast, which
    lets the vectorizers downstream consume every row.
    """
    for column in columns:
        frame[column] = frame[column].astype("str")


# Apply the same cast to the training and the test features.
_as_str_columns(X)
_as_str_columns(X_test)

In [11]:
# Encode the target labels as integers.
# The fitted encoder is kept in `lab_enc`; the encoded labels in `encoded_y`.
# NOTE(review): no later cell visible in this notebook reads `encoded_y` -
# the models below are fitted on the un-encoded `y`.
lab_enc = LabelEncoder().fit(y)
encoded_y = lab_enc.transform(y)

In [31]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import NuSVC

In [15]:
# Base model for the `keyword` column:
# token counts -> TF-IDF weighting -> SMOTE oversampling -> decision tree.
keyword_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=42)),
    ('dtc', DecisionTreeClassifier(random_state=42, max_depth=85)),
]

# Fit on the full training data; `fit` returns the fitted pipeline itself.
keyword_dtc = Pipeline(steps=keyword_steps).fit(X["keyword"], y)

In [18]:
# Base model for the `location` column:
# token counts -> TF-IDF weighting -> SMOTE oversampling -> AdaBoost.
location_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=42)),
    ('abc', AdaBoostClassifier(random_state=42)),
]

# Fit on the full training data; `fit` returns the fitted pipeline itself.
location_abc = Pipeline(steps=location_steps).fit(X["location"], y)

In [21]:
# Base model for the cleaned tweet `text` column:
# token counts -> TF-IDF weighting -> SMOTE oversampling -> multinomial naive Bayes.
text_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=42)),
    ('mnb', MultinomialNB()),
]

# Fit on the full training data; `fit` returns the fitted pipeline itself.
text_mnb = Pipeline(steps=text_steps).fit(X["text"], y)


In [22]:
def add_predictions(X_input):
    """Append the base models' predicted probabilities to ``X_input`` in place.

    The three fitted notebook-level pipelines ``text_mnb``, ``location_abc``
    and ``keyword_dtc`` each score their own column (``text``, ``location``,
    ``keyword``). Column 0 of every ``predict_proba`` result is stored as
    ``text_pred``, ``location_pred`` and ``keyword_pred`` respectively.

    Parameters
    ----------
    X_input : pandas.DataFrame
        Frame holding the ``text``, ``location`` and ``keyword`` columns.
        It is modified in place.

    Returns
    -------
    bool
        Always ``True``.
    """
    base_models = (
        ("text", text_mnb),
        ("location", location_abc),
        ("keyword", keyword_dtc),
    )

    # Score every column first so the frame is only touched once all three
    # models have produced their probabilities.
    probabilities = {
        column: model.predict_proba(X_input[column])
        for column, model in base_models
    }

    # Silence the chained-assignment warning only while the columns are
    # written; the previous option value is restored on exit, so the rest of
    # the notebook keeps pandas' normal warning behaviour.
    with pd.option_context("mode.chained_assignment", None):
        for column, proba in probabilities.items():
            X_input[column + "_pred"] = proba[:, 0]

    return True

In [29]:
# Add the three base-model probability columns to the training features ...
add_predictions(X)
# ... and to the test features. Both frames are modified in place; the cell
# displays the `True` returned by this last call.
add_predictions(X_test)

True

In [39]:
# Preview the first two rows of the newly added base-model probability columns.
X.loc[:, ["text_pred", "location_pred", "keyword_pred"]].head(2)

Unnamed: 0,text_pred,location_pred,keyword_pred
0,0.212053,0.50024,0.260274
1,0.061358,0.50024,0.260274


In [42]:
# Meta model: SMOTE oversampling followed by a Nu-SVC, trained on the three
# base-model probability columns.
# NOTE(review): these columns come from the base models scoring the same rows
# they were fitted on, so the meta model learns from in-sample predictions.
meta_steps = [
    ('smote', SMOTE(random_state=42)),
    ('nusvc', NuSVC(random_state=42)),
]
meta_features_train = X[["text_pred", "location_pred", "keyword_pred"]]

# `fit` returns the fitted pipeline itself.
meta_nusvc = Pipeline(steps=meta_steps).fit(meta_features_train, y)



In [44]:
# Score the test split with the meta model, using the same three
# base-model probability columns it was trained on.
meta_features_test = X_test[["text_pred", "location_pred", "keyword_pred"]]
predictions = meta_nusvc.predict(meta_features_test)

In [46]:
# Attach the meta model's predicted labels to the test frame as `target`
# (this mutates `df_test` in place).
df_test["target"] = predictions

In [50]:
# Build the submission frame: everything in `df_test` except the raw input
# columns, i.e. the remaining id column plus the predicted `target`.
submission = df_test.drop(columns=["keyword", "location", "text"])

In [58]:
# Write the submission file next to the input data, with a header row and
# without the DataFrame index.
submission.to_csv(
    r'../data/submission.csv',
    header=True,
    index=False,
)