In [1]:
import time
import pandas as pd
from data_loader import load_data
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from nlp_processing import LemmaCountVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from naive_bayes import BernoulliNaiveBayes

In [2]:
# import re
# # taken from https://towardsdatascience.com/sentiment-analysis-with-python-part-1-5ce197074184
# REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
# REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

# def preprocess_reviews(reviews):
#     reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
#     reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    
#     return "".join(reviews)

# Feature Engineering and Training

In [3]:
# Load the train / test frames from the project data loader.
train, test = load_data()

# Positional column 1 is used as the model input and positional column 2 of
# the training frame as the integer target.
# NOTE(review): presumably column 1 is the review text and column 2 the
# sentiment label -- confirm against data_loader.
X_train = train.iloc[:, 1].values
X_test = test.iloc[:, 1].values
y = train.iloc[:, 2].values.astype(int)

# Train inputs followed by test inputs, as one plain Python list.
full_text = list(X_train) + list(X_test)

# Last expression of the cell: shape summary shown as the cell output.
"Train(X-%s, y-%s), Test(X-%s)" % (X_train.shape, y.shape, X_test.shape)

'Train(X-(25000,), y-(25000,)), Test(X-(25000,))'

In [4]:
# Scoring metric name handed to every hyper-parameter search via `scoring=score`.
score = "f1"

## Logistic Regression

Feature engineering considering TF-IDF

In [None]:
# Search space for the TF-IDF + logistic regression pipeline. Keys use the
# `<step name>__<parameter>` form so each value is routed to the matching
# pipeline step ('vec', 'tfidf' or 'clf') defined below.
parameters_regression_tfidf = dict(
    vec__min_df=(1, .3, .4, .5),
    vec__stem=(True, False),
    vec__ngram_range=((1, 1), (1, 2), (2, 2)),
    tfidf__norm=('l1', 'l2'),
    tfidf__smooth_idf=(True, False),
    clf__fit_intercept=(True, False),
    clf__C=(1, 2, 3, 0.05, 0.1, 1.5),
)

# vectorizer -> TF-IDF re-weighting -> L2-penalised logistic regression (saga solver)
pipeline_regression_tfidf = Pipeline(steps=[
    ('vec', LemmaCountVectorizer(strip_accents='unicode', stop_words=None, binary=False)),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(solver='saga', penalty='l2')),
])

# Randomised search with 15-fold CV on all cores; random_state fixes which
# parameter combinations get sampled.
rs_regression_tfidf = RandomizedSearchCV(
    estimator=pipeline_regression_tfidf,
    param_distributions=parameters_regression_tfidf,
    scoring=score,
    cv=15,
    n_jobs=-1,
    verbose=0,
    random_state=62,
)

start = time.time()
rs_regression_tfidf.fit(X_train, y)

# Cell output: (elapsed seconds, best parameters, best mean CV score)
(time.time() - start, rs_regression_tfidf.best_params_, rs_regression_tfidf.best_score_)

Feature engineering considering BoW (bag-of-words)

In [None]:
# Search space for the bag-of-words + logistic regression pipeline. Keys use
# the `<step name>__<parameter>` form so each value is routed to the matching
# pipeline step ('vec' or 'clf') defined below.
parameters_regression_bow = {
    # NOTE(review): assuming LemmaCountVectorizer follows CountVectorizer's
    # min_df convention (int = absolute document count, float = proportion
    # of documents) -- confirm in nlp_processing.
    'vec__min_df': (1, .3, .4, .5),
    'vec__stem': (True, False),
    'vec__ngram_range': ((1, 1), (1, 2), (2, 2)),
    # The trailing comma here is required: without it the dict literal is a
    # SyntaxError and the whole cell fails to run.
    'vec__binary': (True, False),
    'clf__fit_intercept': (True, False),
    'clf__C': (1, 2, 3, 0.05, 0.1, 1.5),
}

# vectorizer -> L2-penalised logistic regression (saga solver); no TF-IDF step.
# `binary` is not fixed on the vectorizer because it is part of the search space.
pipeline_regression_bow = Pipeline([
    ('vec', LemmaCountVectorizer(strip_accents='unicode', stop_words=None)),
    ('clf', LogisticRegression(solver='saga', penalty='l2'))
])

# Randomised search with 15-fold CV on all cores; random_state fixes which
# parameter combinations get sampled.
rs_regression_bow = RandomizedSearchCV(pipeline_regression_bow, parameters_regression_bow,
                                       cv=15, scoring=score, n_jobs=-1, verbose=0, random_state=62)
start = time.time()
rs_regression_bow.fit(X_train, y)

# Cell output: (elapsed seconds, best parameters, best mean CV score)
time.time() - start, rs_regression_bow.best_params_, rs_regression_bow.best_score_

## Multinomial Naive Bayes

Feature engineering considering BoW

In [None]:
# Search space for the bag-of-words + Multinomial Naive Bayes pipeline. Keys
# use the `<step name>__<parameter>` form to address the 'vec' and 'clf' steps.
parameters_mnv_bow = dict(
    vec__min_df=(1, .3, .4, .5),
    vec__stem=(True, False),
    vec__ngram_range=((1, 1), (1, 2), (2, 2)),
    # NOTE(review): alpha is the additive smoothing strength; the 0 entry
    # means no smoothing and may trigger numeric warnings depending on the
    # scikit-learn version -- confirm it is intended.
    clf__alpha=(0, 1, 1.5, 2, 2.5, 3),
)

# vectorizer producing raw (non-binary) counts -> Multinomial Naive Bayes
pipeline_mnv_bow = Pipeline(steps=[
    ('vec', LemmaCountVectorizer(strip_accents='unicode', stop_words=None, binary=False)),
    ('clf', MultinomialNB()),
])

# Randomised search with 15-fold CV on all cores; random_state fixes which
# parameter combinations get sampled.
rs_mnv_bow = RandomizedSearchCV(
    estimator=pipeline_mnv_bow,
    param_distributions=parameters_mnv_bow,
    scoring=score,
    cv=15,
    n_jobs=-1,
    verbose=0,
    random_state=62,
)

start = time.time()
rs_mnv_bow.fit(X_train, y)

# Cell output: (elapsed seconds, best parameters, best mean CV score)
(time.time() - start, rs_mnv_bow.best_params_, rs_mnv_bow.best_score_)

## Bernoulli Naive Bayes

## Support Vector Machine

In [None]:
# Unfitted support vector classifier with default settings; this cell only
# declares it and does not run a search.
svm = SVC()

# Candidate grid for the SVM: a single C value and two kernels.
parameters_svm = dict(C=[1], kernel=['linear', 'rbf'])

# Model selection

In [12]:
# Re-extract the raw inputs (positional column 1) and the integer target
# (positional column 2 of the training frame) from the loaded frames.
X_train, X_test = train.iloc[:, 1].values, test.iloc[:, 1].values
y = train.iloc[:, 2].values.astype(int)

# Submission

In [24]:
# Searches fitted earlier in this notebook, keyed by a readable label.
fitted_searches = {
    'regression_tfidf': rs_regression_tfidf,
    'regression_bow': rs_regression_bow,
    'mnv_bow': rs_mnv_bow,
}

# select best model: the search with the highest mean cross-validated score
# (metric given by `score`). `random_search` is bound explicitly here so the
# cell does not rely on a variable left over from a previous kernel session.
best_search_name = max(fitted_searches, key=lambda name: fitted_searches[name].best_score_)
random_search = fitted_searches[best_search_name]

# best_estimator_ is the winning pipeline refitted on the full training set
# (RandomizedSearchCV default refit=True).
model = random_search.best_estimator_
y_pred = model.predict(X_test)
test['Category'] = y_pred

# submission results: every column of `test` except the raw 'Text' input
submission = test.drop(columns='Text')
submission.to_csv('../data/submission.csv', index=False)