Transforming training set words to features with TF-IDF values

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import csv
import numpy as np

# Load the labelled training articles.
df = pd.read_csv("training.csv")

# Target labels: the topic of each training article.
y_train = df["topic"]

# Fit the TF-IDF vocabulary on the training text and, in the same step,
# encode every training article as a row of TF-IDF weights (one column per word).
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(df["article_words"])

# Sanity check: (number of articles, vocabulary size), then the labels.
print(train_features.shape)
print(y_train)


(9500, 35822)
0       FOREX MARKETS
1       MONEY MARKETS
2              SPORTS
3       FOREX MARKETS
4          IRRELEVANT
            ...      
9495          DEFENCE
9496       IRRELEVANT
9497    FOREX MARKETS
9498       IRRELEVANT
9499    FOREX MARKETS
Name: topic, Length: 9500, dtype: object


Transforming test set words to features with TF-IDF values

In [2]:
# Load the labelled test articles.
df_test = pd.read_csv("test.csv")

# Target labels: the topic of each test article.
y_test = df_test["topic"]

# Encode the test text with the vectorizer fitted on the training set.
# Only transform() is used here, so the vocabulary and IDF weights stay those
# learned from the training data and the column count matches train_features.
test_features = vectorizer.transform(df_test["article_words"])

# Sanity check: (number of articles, vocabulary size), then the labels.
print(test_features.shape)
print(y_test)

(500, 35822)
0          IRRELEVANT
1          IRRELEVANT
2       FOREX MARKETS
3          IRRELEVANT
4          IRRELEVANT
            ...      
495        IRRELEVANT
496            SPORTS
497     MONEY MARKETS
498    SHARE LISTINGS
499        IRRELEVANT
Name: topic, Length: 500, dtype: object


Rebuilding the best SGD classifier from its best estimated parameters and fitting it on the training set

In [6]:
from sklearn.linear_model import SGDClassifier

# Re-create the saved SGD classifier best_estimator from its best parameters.
# Every keyword is spelled out so the configuration is visible at a glance.
SGD_best_estimator = SGDClassifier(
    alpha=0.0001,
    average=False,
    class_weight=None,
    early_stopping=False,
    epsilon=0.1,
    eta0=0.05,
    fit_intercept=True,
    l1_ratio=0.15,
    learning_rate='optimal',
    loss='hinge',
    max_iter=1000,
    n_iter_no_change=5,
    n_jobs=None,
    penalty='l2',
    power_t=0.5,
    random_state=None,
    shuffle=False,
    tol=1e-05,
    validation_fraction=0.1,
    verbose=0,
    warm_start=False,
)

# Train on the TF-IDF training features, then label every test article.
SGD_best_estimator.fit(train_features, y_train)
SGD_prediction = SGD_best_estimator.predict(test_features)

Assigning the best classifier with the best accuracy from the pipeline to calculate the top 10 articles

Calculating the probability that each test article belongs to each topic

In [7]:
from sklearn.calibration import CalibratedClassifierCV

# Using CalibratedClassifierCV to find the probabilities: the SGD classifier is
# trained with loss='hinge', which does not provide predict_proba on its own.
# cv='prefit' reuses the already-fitted SGD_best_estimator and only fits the
# sigmoid calibrator on top of its decision scores.
clf_sigmoid = CalibratedClassifierCV(SGD_best_estimator, cv='prefit', method='sigmoid')

# NOTE(review): the calibrator is fitted on the same data the classifier was
# trained on, so the probabilities are likely over-confident; a held-out
# calibration split would give better-calibrated values.
clf_sigmoid.fit(train_features, y_train)

# Topic names in the column order of predict_proba. They are read from the
# fitted model instead of being hard-coded, so the column labels below (and the
# per-topic indexing in later cells) cannot drift out of step with the
# probability columns.
target_names = list(clf_sigmoid.classes_)

# Displaying the Data Frame: one row per test article, one column per topic,
# values as percentages.
pd.DataFrame(clf_sigmoid.predict_proba(test_features) * 100, columns=target_names)

Unnamed: 0,ARTS CULTURE ENTERTAINMENT,BIOGRAPHIES PERSONALITIES PEOPLE,DEFENCE,DOMESTIC MARKETS,FOREX MARKETS,HEALTH,IRRELEVANT,MONEY MARKETS,SCIENCE AND TECHNOLOGY,SHARE LISTINGS,SPORTS
0,0.106254,0.021734,0.065287,0.133561,1.122663,0.004055,92.838555,1.367539,0.041650,4.262008,0.036693
1,0.860383,0.706230,0.029836,0.126760,0.323635,0.040911,94.569626,2.809255,0.048419,0.045931,0.439014
2,0.056909,0.000887,0.008620,0.004900,36.067542,0.007655,10.562073,53.263200,0.009768,0.015474,0.002972
3,0.339104,0.137142,0.085926,1.057690,2.915370,0.072759,93.852634,1.419822,0.093724,0.022768,0.003061
4,0.023413,0.003723,0.009620,0.014383,2.075516,0.004928,97.492735,0.342862,0.003416,0.026852,0.002551
...,...,...,...,...,...,...,...,...,...,...,...
495,3.067187,0.019736,0.141322,0.103955,0.887580,0.027521,94.277181,1.228854,0.042504,0.196921,0.007240
496,0.032017,0.171810,0.033454,0.044450,0.663983,0.021866,1.131619,1.375483,0.043775,0.015573,96.465970
497,0.006918,0.009650,0.001584,0.001757,0.541179,0.009328,0.769880,98.650153,0.007395,0.000898,0.001259
498,0.051044,0.045523,0.041253,0.106685,1.665536,0.013085,78.516290,5.668029,0.047228,13.738685,0.106640


Calculating and returning up to 10 recommended articles for each topic

In [8]:
# Number given to the first test article. The training set holds 9500 rows, so
# the test articles are presumably numbered from 9501 onwards -- TODO confirm
# against the data set's own article numbering.
FIRST_ARTICLE_NUMBER = 9501

# Maximum number of candidate articles considered per topic.
TOP_N = 10

# Predicted topic of each test article, used to drop candidates the classifier
# did not actually assign to the topic being listed. The original cell read
# `best_clf_prediction` without it being defined anywhere in the notebook
# (NameError on a fresh kernel); the SGD predictions are presumably what was
# meant -- NOTE(review): confirm.
best_clf_prediction = SGD_prediction

# Transpose so that 1st dimension: topics, 2nd dimension: probabilities (in %)
proba = (clf_sigmoid.predict_proba(test_features) * 100).T.tolist()

# Article numbers, in the same order as the rows of the test set.
# Built once: the list does not depend on the topic.
keys = list(range(FIRST_ARTICLE_NUMBER, FIRST_ARTICLE_NUMBER + test_features.shape[0]))

# Top candidates per topic, regardless of the predicted topic
tempsort = []

# Top candidates per topic that were also predicted to belong to that topic
psort = [[] for _ in target_names]

# For every topic
for i, topic in enumerate(target_names):

    # Pair article numbers with their probability for this topic and sort the
    # pairs in descending order of probability. sorted() is stable, so articles
    # with equal probability keep their original (article number) order.
    ranked = sorted(zip(keys, proba[i]), key=lambda pair: pair[1], reverse=True)

    # Keep only the highest-ranked candidates
    tempsort.append(ranked[:TOP_N])

    # Of those candidates, keep the ones actually predicted to be in this topic.
    # Dropped candidates are not replaced, so a topic may list fewer than TOP_N.
    psort[i] = [
        (article, probability)
        for article, probability in tempsort[i]
        if best_clf_prediction[article - FIRST_ARTICLE_NUMBER] == topic
    ]

    # Display the recommendations for this topic
    display(pd.DataFrame(psort[i], columns=["Article", topic]))

Unnamed: 0,Article,ARTS CULTURE ENTERTAINMENT
0,9830,87.363606
1,9952,85.058519
2,9789,84.435152
3,9703,81.475318
4,9933,60.237468
5,9526,55.26365


Unnamed: 0,Article,BIOGRAPHIES PERSONALITIES PEOPLE
0,9940,92.034327
1,9988,89.235791
2,9878,70.989828


Unnamed: 0,Article,DEFENCE
0,9616,94.861336
1,9559,94.682608
2,9842,87.695888
3,9670,84.835708
4,9576,84.811624
5,9773,81.978559
6,9770,73.935499
7,9607,61.216782


Unnamed: 0,Article,DOMESTIC MARKETS
0,9994,64.763037
1,9640,58.570165


Unnamed: 0,Article,FOREX MARKETS
0,9551,95.60237
1,9588,95.130405
2,9682,95.130405
3,9632,93.322181
4,9798,91.03548
5,9986,89.287773
6,9772,88.203763
7,9786,87.535848
8,9529,87.52215
9,9671,86.736392


Unnamed: 0,Article,HEALTH
0,9661,90.541385
1,9873,89.320152
2,9833,86.362488
3,9926,86.13014
4,9929,83.484343
5,9947,80.922672
6,9609,75.65648
7,9621,73.19099
8,9911,71.973257
9,9978,71.799161


Unnamed: 0,Article,IRRELEVANT
0,9624,99.084124
1,9958,99.076062
2,9884,99.012886
3,9907,98.951218
4,9925,98.78782
5,9686,98.711189
6,9914,98.586994
7,9932,98.489552
8,9785,98.465641
9,9652,98.405069


Unnamed: 0,Article,MONEY MARKETS
0,9618,99.445103
1,9871,99.206472
2,9755,98.742538
3,9761,98.721673
4,9998,98.650153
5,9835,97.723138
6,9769,97.500945
7,9602,97.486844
8,9840,97.424703
9,9707,97.298256


Unnamed: 0,Article,SCIENCE AND TECHNOLOGY
0,9617,81.254089
1,9982,72.154342


Unnamed: 0,Article,SHARE LISTINGS
0,9518,92.200551
1,9601,76.635601
2,9715,72.172476
3,9666,55.55448
4,9668,16.989003


Unnamed: 0,Article,SPORTS
0,9857,97.710063
1,9760,97.35093
2,9848,97.261172
3,9922,97.258643
4,9569,97.127025
5,9574,97.037861
6,9997,96.46597
7,9849,96.416156
8,9787,96.373563
9,9886,96.370285


Printing the Classification Report for the best classifier

In [9]:
from sklearn.metrics import classification_report

# Per-topic precision, recall, F1 and support of the SGD predictions on the test set.
report = classification_report(y_test, SGD_prediction, target_names=target_names)
print(report)

                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.33      0.67      0.44         3
BIOGRAPHIES PERSONALITIES PEOPLE       1.00      0.20      0.33        15
                         DEFENCE       0.88      0.54      0.67        13
                DOMESTIC MARKETS       0.50      0.50      0.50         2
                   FOREX MARKETS       0.41      0.27      0.33        48
                          HEALTH       0.69      0.64      0.67        14
                      IRRELEVANT       0.86      0.89      0.87       266
                   MONEY MARKETS       0.52      0.68      0.59        69
          SCIENCE AND TECHNOLOGY       0.00      0.00      0.00         3
                  SHARE LISTINGS       0.40      0.29      0.33         7
                          SPORTS       0.95      0.97      0.96        60

                        accuracy                           0.76       500
                       macro avg    