In [1]:
import os
import numpy as np
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

from data_preparation import DataPrep
from feature_engineering import FeatureEngineer
#from cnn_text import CNN_Text
#from rnn_text import RNN_Text

In [2]:
def train_model(classifier, feature_vector_train, train_label, feature_vector_valid, valid_label):
    """Fit ``classifier`` on the training split and score it on the validation split.

    Args:
        classifier: Object exposing ``fit(X, y)`` and ``predict(X)``; ``fit`` is
            called on the instance that is passed in.
        feature_vector_train: Training features handed to ``classifier.fit``.
        train_label: Training labels handed to ``classifier.fit``.
        feature_vector_valid: Validation features handed to ``classifier.predict``.
        valid_label: Ground-truth labels matching ``feature_vector_valid``.

    Returns:
        Whatever ``metrics.accuracy_score`` returns for the validation predictions.
    """
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, train_label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    # Name the arguments explicitly: the ground truth is y_true and the model
    # output is y_pred. Accuracy is symmetric, but keeping the roles straight
    # avoids a silent error if this metric is ever swapped for an asymmetric
    # one (precision, recall, F1, ...).
    return metrics.accuracy_score(y_true=valid_label, y_pred=predictions)

In [3]:
def logistic_regression_model(data, features):
    """Evaluate a default ``LogisticRegression`` on every feature representation.

    For each representation, a new ``linear_model.LogisticRegression()`` is
    passed to ``train_model`` together with the matching train/validation
    matrices, and the returned accuracy is printed.

    Args:
        data: Object providing ``y_train`` and ``y_valid``.
        features: Object providing the ``X_train_*`` / ``X_valid_*`` attribute
            pairs named in the table below.

    Returns:
        None. Results are only printed.
    """
    # (printed label, train attribute, validation attribute)
    # Trailing numbers are the accuracies recorded from an earlier run.
    experiments = (
        # Linear Classifier on Count Vectors
        ("LR, Count Vectors Validation: ", "X_train_count", "X_valid_count"),  # 0.6958262980160406
        # Linear Classifier on Word Level TF IDF Vectors
        ("LR, WordLevel TF-IDF Validation: ", "X_train_tfidf", "X_valid_tfidf"),  # 0.6873733642887294
        # Linear Classifier on Ngram Level TF IDF Vectors
        ("LR, N-Gram Vectors Validation: ", "X_train_tfidf_ngram", "X_valid_tfidf_ngram"),  # 0.660658505698607
        # Linear Classifier on Character Level TF IDF Vectors
        ("LR, CharLevel Vectors Validation: ", "X_train_tfidf_ngram_chars", "X_valid_tfidf_ngram_chars"),  # 0.6990977205571971
    )

    for label, train_attr, valid_attr in experiments:
        # Attributes are looked up inside the loop, so each matrix is only
        # touched right before the run that needs it.
        score = train_model(
            linear_model.LogisticRegression(),
            getattr(features, train_attr),
            data.y_train,
            getattr(features, valid_attr),
            data.y_valid,
        )
        print(label, score)

In [4]:
def main():
    """Run the experiment end to end.

    Builds a ``DataPrep`` instance, derives a ``FeatureEngineer`` from it, and
    hands both to ``logistic_regression_model``, which prints the results.
    """
    prepared = DataPrep()
    engineered = FeatureEngineer(prepared)

    logistic_regression_model(prepared, engineered)

    # The CNN and RNN experiments (cnn_model / rnn_model) are currently
    # disabled; only the logistic-regression baseline runs.

In [5]:
# Entry point: run the whole pipeline when this cell executes as the main
# module; the accuracies that main() prints appear as this cell's output.
if __name__ == "__main__":
    main()
    



LR, Count Vectors Validation:  0.6958262980160406
LR, WordLevel TF-IDF Validation:  0.6873733642887294
LR, N-Gram Vectors Validation:  0.660658505698607
LR, CharLevel Vectors Validation:  0.6990977205571971
