In [22]:
from time import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import csv
import numpy as np

In [23]:
# Read the train and test splits (training.csv first, then test.csv) and
# pull out the 'topic' column of each as the label series.
training_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('training.csv', 'test.csv')
)
y_train, y_test = training_df['topic'], test_df['topic']

# Filter model: relabel every topic other than 'IRRELEVANT' as 'RELEVANT' (binary relevance target).
def make_rel(x):
    """Collapse a topic label into a binary relevance label.

    Args:
        x: A topic label.

    Returns:
        ``x`` itself when it equals ``'IRRELEVANT'``; otherwise the string
        ``'RELEVANT'``.
    """
    return 'RELEVANT' if x != 'IRRELEVANT' else x
    
# Relevance view: same rows as the source frames, with 'topic' collapsed by
# make_rel. assign() works on a copy, so training_df / test_df stay untouched.
training_rel_df = training_df.assign(topic=training_df['topic'].apply(make_rel))
test_rel_df = test_df.assign(topic=test_df['topic'].apply(make_rel))

# Category view: only the rows whose topic is not 'IRRELEVANT', with the
# original topic labels kept. The trailing copy() detaches each result from
# its source frame.
training_cat_df = training_df[training_df['topic'] != 'IRRELEVANT'].copy()
test_cat_df = test_df[test_df['topic'] != 'IRRELEVANT'].copy()



In [39]:
# Relevance classifier, random-forest variant:
# 2- and 3-gram counts -> sublinear TF-IDF -> RandomForestClassifier.
rel_pipe = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(min_df=6, ngram_range=(2, 3)),
    ),
    (
        'tfidf',
        TfidfTransformer(sublinear_tf=True, use_idf=True),
    ),
    # Optional feature-selection step, currently disabled:
    # ('sel', SelectKBest(mutual_info_classif, k=1000)),
    (
        'clf',
        RandomForestClassifier(
            n_estimators=500,
            max_depth=5000,
            min_samples_leaf=3,
            n_jobs=-1,
            random_state=1,
            verbose=1,
        ),
    ),
])

# Fit on the training split, then print the pipeline's score on the test split.
rel_pipe.fit(X=training_rel_df['article_words'], y=training_rel_df['topic'])
print(rel_pipe.score(X=test_rel_df['article_words'], y=test_rel_df['topic']))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   29.7s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s


0.844


[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.3s finished


In [5]:
# Relevance classifier, linear-SVM variant:
# 2- and 3-gram counts -> sublinear TF-IDF -> LinearSVC.
# Note: this rebinds `rel_pipe`, replacing any pipeline previously held under that name.
rel_pipe = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(min_df=6, ngram_range=(2, 3)),
    ),
    (
        'tfidf',
        TfidfTransformer(sublinear_tf=True, use_idf=True),
    ),
    # Optional feature-selection step, currently disabled:
    # ('sel', SelectKBest(mutual_info_classif, k=1000)),
    (
        'clf',
        LinearSVC(C=0.5, random_state=1),
    ),
])

# Fit on the training split, then print the pipeline's score on the test split.
rel_pipe.fit(X=training_rel_df['article_words'], y=training_rel_df['topic'])
print(rel_pipe.score(X=test_rel_df['article_words'], y=test_rel_df['topic']))

0.856


In [16]:
# Topic classifier over the non-IRRELEVANT rows, linear-SVM variant:
# 1- to 3-gram counts -> sublinear TF-IDF -> LinearSVC.
cat_pipe = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(min_df=7, ngram_range=(1, 3)),
    ),
    (
        'tfidf',
        TfidfTransformer(sublinear_tf=True, use_idf=True),
    ),
    # Optional feature-selection step, currently disabled:
    # ('sel', SelectKBest(mutual_info_classif, k=100)),
    (
        'clf',
        LinearSVC(dual=True, random_state=1),
    ),
])

# Fit on the training split, then print the pipeline's score on the test split.
cat_pipe.fit(X=training_cat_df['article_words'], y=training_cat_df['topic'])
print(cat_pipe.score(X=test_cat_df['article_words'], y=test_cat_df['topic']))

0.7692307692307693


In [21]:
# Topic classifier over the non-IRRELEVANT rows, RBF-kernel SVM variant:
# 1- to 3-gram counts -> sublinear TF-IDF -> SVC(kernel='rbf').
# Note: this rebinds `cat_pipe`, replacing any pipeline previously held under that name.
# NOTE(review): C and gamma are left at the library defaults here; the score
# recorded for this cell is far below the LinearSVC variant's, so these
# presumably need tuning before this model is usable -- confirm.
cat_pipe = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(min_df=7, ngram_range=(1, 3)),
    ),
    (
        'tfidf',
        TfidfTransformer(sublinear_tf=True, use_idf=True),
    ),
    # Optional feature-selection step, currently disabled:
    # ('sel', SelectKBest(mutual_info_classif, k=100)),
    (
        'clf',
        SVC(kernel='rbf', random_state=1),
    ),
])

# Fit on the training split, then print the pipeline's score on the test split.
cat_pipe.fit(X=training_cat_df['article_words'], y=training_cat_df['topic'])
print(cat_pipe.score(X=test_cat_df['article_words'], y=test_cat_df['topic']))

0.2948717948717949
