In [82]:
from time import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import FunctionTransformer
import csv
import numpy as np

In [6]:
training_df = pd.read_csv('training.csv')
test_df = pd.read_csv('test.csv')
y_train = training_df['topic']
y_test = test_df['topic']

In [10]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

parameters = {
    #'vect__min_df': (1, 3, 5, 11, 17, 31),
    'vect__min_df': (1, 5, 11),
    'vect__max_df': (0.75, 0.85, 1.0),
    #'vect__max_df': (0.65, 0.75, 0.85, 1.0),
    'vect__max_features': (None, 5000, 10000),
    #'vect__max_features': (None, 5000, 10000, 15000, 25000),
    'vect__ngram_range': ((1, 1), (1,3), (2,3), (3,6)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__sublinear_tf': [True],
    #'tfidf__sublinear_tf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'clf__fit_prior': (True, False)
}

grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
print(parameters)
t0 = time()
grid_search.fit(training_df['article_words'], training_df['topic'])
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__min_df': (1, 5, 11), 'vect__max_df': (0.75, 0.85, 1.0), 'vect__max_features': (None, 5000, 10000), 'vect__ngram_range': ((1, 1), (1, 3), (2, 3), (3, 6)), 'tfidf__use_idf': (True, False), 'tfidf__sublinear_tf': [True], 'clf__fit_prior': (True, False)}
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 19.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 42.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 74.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 112.0min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 159.9min
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed: 187.1min finished


done in 11234.100s

Best score: 0.748
Best parameters set:
	clf__fit_prior: False
	tfidf__sublinear_tf: True
	tfidf__use_idf: True
	vect__max_df: 0.75
	vect__max_features: None
	vect__min_df: 5
	vect__ngram_range: (2, 3)


In [11]:
#preds = grid_search.predict(test_df['article_words'])
acc = grid_search.score(test_df['article_words'], test_df['topic'])
print(acc)

0.752


In [53]:
bestimator = Pipeline([
    ('vect', CountVectorizer(ngram_range=(2,3), max_df=0.75, min_df=5, max_features=None)),
    ('tfidf', TfidfTransformer(use_idf=True, sublinear_tf=True)),
    ('clf', MultinomialNB(fit_prior=False)),
])
bestimator.fit(training_df['article_words'], training_df['topic'])
print(bestimator.score(test_df['article_words'], test_df['topic']))

0.752


In [74]:
#data leak
#filter model
def make_rel(x):
    if x != 'IRRELEVANT': return 'RELEVANT'
    else: return x
    
training_rel_df = training_df.copy()
test_rel_df = test_df.copy()
training_rel_df['topic'] = training_rel_df['topic'].apply(make_rel)
test_rel_df['topic'] = test_rel_df['topic'].apply(make_rel)

rel_pipe = Pipeline([
    ('vect', CountVectorizer(ngram_range=(2,3), min_df=6)),
    ('tfidf', TfidfTransformer(use_idf=True, sublinear_tf=True)),
    #('sel', SelectKBest(mutual_info_classif, k=1000)),
    ('clf', MultinomialNB(fit_prior=False))
])
rel_pipe.fit(training_rel_df['article_words'], training_rel_df['topic'])
print(rel_pipe.score(test_rel_df['article_words'], test_rel_df['topic']))
#experiments with feature selection tended to have poorer results
#some studies about this phenomenon in text classification, even weak
#components tend to be usefully informative

0.836


In [77]:
#relevancy differentiability
training_cat_df = training_df.copy()
test_cat_df = test_df.copy()

training_cat_df = training_cat_df[training_cat_df.topic != 'IRRELEVANT']
test_cat_df = test_cat_df[test_cat_df.topic != 'IRRELEVANT']

cat_pipe = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,3), min_df=7)),
    ('tfidf', TfidfTransformer(use_idf=True, sublinear_tf=True)),
    #('sel', SelectKBest(mututal_info_classif, k=100)),
    ('clf', MultinomialNB(alpha=0, fit_prior=False))
])

cat_pipe.fit(training_cat_df['article_words'], training_cat_df['topic'])
print(cat_pipe.score(test_cat_df['article_words'], test_cat_df['topic']))
preds = cat_pipe.predict(test_cat_df['article_words'])
report_df = pd.DataFrame({'True':test_cat_df['topic'], 'Pred':preds})
print(report_df)

  'setting alpha = %.1e' % _ALPHA_MIN)


0.7905982905982906
                                 True                              Pred
2                       FOREX MARKETS                     MONEY MARKETS
5                       FOREX MARKETS                     FOREX MARKETS
7                              SPORTS                            SPORTS
12                             SPORTS                            SPORTS
15                      MONEY MARKETS                     MONEY MARKETS
17                     SHARE LISTINGS                    SHARE LISTINGS
19                             SPORTS                            SPORTS
24                      FOREX MARKETS                     FOREX MARKETS
25   BIOGRAPHIES PERSONALITIES PEOPLE        ARTS CULTURE ENTERTAINMENT
28                      FOREX MARKETS                     MONEY MARKETS
29                      FOREX MARKETS                     FOREX MARKETS
30                      MONEY MARKETS                     MONEY MARKETS
33                      MONEY MARKETS        

In [85]:
training_fin_df = training_df.copy()
test_fin_df = test_df.copy()

training_fin_df = training_fin_df[(training_fin_df.topic == 'FOREX MARKETS') | (training_fin_df.topic == 'MONEY MARKETS')]
test_fin_df = test_fin_df[(test_fin_df.topic == 'FOREX MARKETS') | (test_fin_df.topic == 'MONEY MARKETS')]

def get_text_length(x):
    return np.array([len(t) for t in x]).reshape(-1, 1)

word_pipe = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,3), min_df=7)),
    ('tfidf', TfidfTransformer(use_idf=True, sublinear_tf=True)),
    #('sel', SelectKBest(mututal_info_classif, k=100)),
    #('clf', MultinomialNB(alpha=0, fit_prior=False))
])


fin_pipe = Pipeline([
    ('features', FeatureUnion([
        ('text', word_pipe),
        ('length', Pipeline([
            ('count', FunctionTransformer(get_text_length, validate=False)),
        ]))
    ])),
    ('clf', MultinomialNB(alpha=0, fit_prior=False))])

fin_pipe.fit(training_fin_df['article_words'], training_fin_df['topic'])
print(cat_pipe.score(test_fin_df['article_words'], test_fin_df['topic']))
preds = cat_pipe.predict(test_fin_df['article_words'])
report_df = pd.DataFrame({'True':test_fin_df['topic'], 'Pred':preds})
print(report_df)

0.7094017094017094
              True           Pred
2    FOREX MARKETS  MONEY MARKETS
5    FOREX MARKETS  FOREX MARKETS
15   MONEY MARKETS  MONEY MARKETS
24   FOREX MARKETS  FOREX MARKETS
28   FOREX MARKETS  MONEY MARKETS
29   FOREX MARKETS  FOREX MARKETS
30   MONEY MARKETS  MONEY MARKETS
33   MONEY MARKETS  MONEY MARKETS
38   FOREX MARKETS  FOREX MARKETS
41   MONEY MARKETS  FOREX MARKETS
47   MONEY MARKETS  MONEY MARKETS
49   MONEY MARKETS  MONEY MARKETS
50   FOREX MARKETS  FOREX MARKETS
52   MONEY MARKETS  MONEY MARKETS
53   FOREX MARKETS  MONEY MARKETS
54   MONEY MARKETS  FOREX MARKETS
59   MONEY MARKETS  MONEY MARKETS
64   MONEY MARKETS  FOREX MARKETS
69   FOREX MARKETS  FOREX MARKETS
71   MONEY MARKETS  FOREX MARKETS
76   FOREX MARKETS  FOREX MARKETS
83   FOREX MARKETS  FOREX MARKETS
85   MONEY MARKETS  MONEY MARKETS
86   MONEY MARKETS  MONEY MARKETS
87   FOREX MARKETS  FOREX MARKETS
88   MONEY MARKETS  MONEY MARKETS
93   MONEY MARKETS  MONEY MARKETS
98   MONEY MARKETS  MONEY MAR

  'setting alpha = %.1e' % _ALPHA_MIN)


In [60]:
preds = []
y=test_df['topic']
for text in test_df['article_words']:
    text_wrap = [text]
    pred = rel_pipe.predict(text_wrap)[0]
    if pred == 'RELEVANT': pred = cat_pipe.predict(text_wrap)[0]
    preds.append(pred)
#print(preds[0])
acc = (preds == y).mean()
print(acc)

0.762


In [68]:
#voter
preds = []
y=test_df['topic']
for text in test_df['article_words']:
    text_wrap = [text]
    pred = rel_pipe.predict(text_wrap)[0]
    pred_prob1 = rel_pipe.predict_proba(text_wrap)[0]
    pred_prob1 = max(pred_prob1)
    pred_prob2 = cat_pipe.predict_proba(text_wrap)[0]
    pred_prob2 = max(pred_prob2)
    if pred == 'RELEVANT': pred = cat_pipe.predict(text_wrap)[0]
    #elif pred_prob2 > pred_prob1: print(cat_pipe.predict(text_wrap)[0], pred_prob2, '>', pred_prob1)
    preds.append(pred)
#print(preds[0])
acc = (preds == y).mean()
report_df = pd.DataFrame({'True':y, 'Pred':preds})
print(report_df)
print(acc)
print(classification_report(y, preds))

                                 True                              Pred
0                          IRRELEVANT                        IRRELEVANT
1                          IRRELEVANT                        IRRELEVANT
2                       FOREX MARKETS                     MONEY MARKETS
3                          IRRELEVANT                        IRRELEVANT
4                          IRRELEVANT                        IRRELEVANT
5                       FOREX MARKETS                     FOREX MARKETS
6                          IRRELEVANT                        IRRELEVANT
7                              SPORTS                            SPORTS
8                          IRRELEVANT                     MONEY MARKETS
9                          IRRELEVANT                        IRRELEVANT
10                         IRRELEVANT                        IRRELEVANT
11                         IRRELEVANT                        IRRELEVANT
12                             SPORTS                           

In [8]:
#custom vectorize
#naive scorer model...not really probabilities but class dependent score
corpus_indexer = {}
corpus_index = 0
for sentence in training_df['article_words']:
    words = sentence.split(',')
    for word in words:
        if word not in corpus_indexer:
            corpus_indexer[word] = corpus_index
            corpus_index += 1

print(corpus_index)

#probably dont need this intermediate step/datastructure
class_data = {}
for row in training_df.itertuples():
    words = row.article_words.split(',')
    topic = row.topic
    if topic not in class_data:
        class_data[topic] = []
    doc_vector = np.zeros((corpus_index))
    for word in words:
        index = corpus_indexer[word]
        doc_vector[index] += 1
    class_data[topic].append(doc_vector)

#do numpy stuff
for topic, matrix in class_data.items():
    matrix = np.vstack(matrix)
    matrix = matrix/np.linalg.norm(matrix, axis=1, keepdims=True)
    class_data[topic] = matrix.mean(0)
    
for topic, vector in class_data.items():
    print(vector)
        

35823
[2.38405306e-02 1.17232812e-03 5.92409583e-02 ... 0.00000000e+00
 3.99161691e-05 3.99161691e-05]
[0.01891437 0.00069765 0.05654598 ... 0.         0.         0.        ]
[0.02549916 0.00128358 0.00746048 ... 0.         0.         0.        ]
[7.63176384e-03 5.42080874e-04 2.31130935e-02 ... 2.59036476e-05
 0.00000000e+00 0.00000000e+00]
[0.00783205 0.         0.01235197 ... 0.         0.         0.        ]
[0.00885717 0.00061149 0.01725865 ... 0.         0.         0.        ]
[0.00167749 0.00025989 0.0063773  ... 0.         0.         0.        ]
[0.00936375 0.00098046 0.012028   ... 0.         0.         0.        ]
[0.0129255  0.         0.01757448 ... 0.         0.         0.        ]
[0.004762   0.00035784 0.01321229 ... 0.         0.         0.        ]
[0.01181187 0.00057624 0.0091316  ... 0.         0.         0.        ]


In [9]:
def my_vectorizer(words, indexer, size):
    vec = np.zeros((size))
    for word in words:
        if word not in indexer: continue
        index = indexer[word]
        vec[index] += 1
    return vec/np.linalg.norm(vec, keepdims=True)

def my_classify(model, vec):
    classes = list(model.keys())
    num_classes = len(classes)
    vals = np.zeros((num_classes))
    for i, k in enumerate(classes):
        model_vec = model[k]
        vals[i] = np.sqrt(np.mean((model_vec-vec)**2))
    c = np.argmin(vals)
    return classes[c]

def accuracy(preds, true):
    size = len(preds)
    total = 0
    for i, v in enumerate(true):
        if preds[i] == v: total += 1
    return total/size

preds = []
for sentence in test_df['article_words']:
    words = sentence.split(',')
    vec = my_vectorizer(words, corpus_indexer, corpus_index)
    preds.append(my_classify(class_data, vec))

acc = accuracy(preds, test_df['topic'])
print(acc)
    

0.636


In [7]:
#a bit confused how test data is transformed...is term frequency within a doc calculated for test data
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(training_df['article_words'])
test_features = vectorizer.transform(test_df['article_words'])
classifier = MultinomialNB()
classifier.fit(train_features, y_train)
predictions = classifier.predict(test_features)
acc = np.mean(predictions == y_test)
print(acc)

0.68


In [8]:
#a bit confused how test data is transformed...is term frequency within a doc calculated for test data
vectorizer = TfidfVectorizer(ngram_range=(1,2))
train_features = vectorizer.fit_transform(training_df['article_words'])
test_features = vectorizer.transform(test_df['article_words'])
classifier = MultinomialNB()
classifier.fit(train_features, y_train)
predictions = classifier.predict(test_features)
acc = np.mean(predictions == y_test)
print(acc)

0.642


In [9]:
#a bit confused how test data is transformed...is term frequency within a doc calculated for test data
vectorizer = TfidfVectorizer(ngram_range=(2,2))
train_features = vectorizer.fit_transform(training_df['article_words'])
test_features = vectorizer.transform(test_df['article_words'])
classifier = MultinomialNB()
classifier.fit(train_features, y_train)
predictions = classifier.predict(test_features)
acc = np.mean(predictions == y_test)
print(acc)

0.646


In [10]:
#a bit confused how test data is transformed...is term frequency within a doc calculated for test data
vectorizer = TfidfVectorizer(ngram_range=(1,5))
train_features = vectorizer.fit_transform(training_df['article_words'])
test_features = vectorizer.transform(test_df['article_words'])
classifier = MultinomialNB()
classifier.fit(train_features, y_train)
predictions = classifier.predict(test_features)
acc = np.mean(predictions == y_test)
print(acc)

0.636


In [11]:
#a bit confused how test data is transformed...is term frequency within a doc calculated for test data
vectorizer = TfidfVectorizer(ngram_range=(10,10))
train_features = vectorizer.fit_transform(training_df['article_words'])
test_features = vectorizer.transform(test_df['article_words'])
classifier = MultinomialNB()
classifier.fit(train_features, y_train)
predictions = classifier.predict(test_features)
acc = np.mean(predictions == y_test)
print(acc)

0.554


In [13]:
#might be good to get rid of irrelevant class...bias towards it especially since large class
#term or document frequency?
#a bit confused how test data is transformed...is term frequency within a doc calculated for test data
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(training_df['article_words'])
test_features = vectorizer.transform(test_df['article_words'])
classifier = MultinomialNB()
classifier.fit(train_features, y_train)
predictions = classifier.predict(test_features)
acc = np.mean(predictions == y_test)
print(acc)

0.728


In [14]:
vectorizer = CountVectorizer(ngram_range=(1,2))
train_features = vectorizer.fit_transform(training_df['article_words'])
test_features = vectorizer.transform(test_df['article_words'])
classifier = MultinomialNB()
classifier.fit(train_features, y_train)
predictions = classifier.predict(test_features)
acc = np.mean(predictions == y_test)
print(acc)

0.734


In [16]:
vectorizer = CountVectorizer(ngram_range=(1,3))
train_features = vectorizer.fit_transform(training_df['article_words'])
test_features = vectorizer.transform(test_df['article_words'])
classifier = MultinomialNB()
classifier.fit(train_features, y_train)
predictions = classifier.predict(test_features)
acc = np.mean(predictions == y_test)
print(acc)

0.732


In [15]:
vectorizer = CountVectorizer(ngram_range=(1,5))
train_features = vectorizer.fit_transform(training_df['article_words'])
test_features = vectorizer.transform(test_df['article_words'])
classifier = MultinomialNB()
classifier.fit(train_features, y_train)
predictions = classifier.predict(test_features)
acc = np.mean(predictions == y_test)
print(acc)

0.73


In [17]:
vectorizer = CountVectorizer(ngram_range=(1,1))
train_features = vectorizer.fit_transform(training_df['article_words'])
test_features = vectorizer.transform(test_df['article_words'])
classifier = MultinomialNB(fit_prior=False)
classifier.fit(train_features, y_train)
predictions = classifier.predict(test_features)
acc = np.mean(predictions == y_test)
print(acc)

0.73


In [18]:
vectorizer = CountVectorizer(ngram_range=(1,2))
train_features = vectorizer.fit_transform(training_df['article_words'])
test_features = vectorizer.transform(test_df['article_words'])
classifier = MultinomialNB(fit_prior=False)
classifier.fit(train_features, y_train)
predictions = classifier.predict(test_features)
acc = np.mean(predictions == y_test)
print(acc)

0.736


In [19]:
vectorizer = CountVectorizer(ngram_range=(1,3))
train_features = vectorizer.fit_transform(training_df['article_words'])
test_features = vectorizer.transform(test_df['article_words'])
classifier = MultinomialNB(fit_prior=False)
classifier.fit(train_features, y_train)
predictions = classifier.predict(test_features)
acc = np.mean(predictions == y_test)
print(acc)

0.734


In [22]:
vectorizer = TfidfVectorizer(ngram_range=(1,3), use_idf=False, sublinear_tf=True)
train_features = vectorizer.fit_transform(training_df['article_words'])
test_features = vectorizer.transform(test_df['article_words'])
classifier = MultinomialNB()
classifier.fit(train_features, y_train)
predictions = classifier.predict(test_features)
acc = np.mean(predictions == y_test)
print(acc)

0.6


In [None]:
#work still needs to be done with the actual feature metric being used some combination of its frequency within a document and the differentiability of that word in a class compared to others...that might be what feature selection does

In [None]:
#using feature selection