In [1]:
## importing all necessary modules

from nltk.corpus import stopwords
from nltk import pos_tag
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import pandas as pd
import numpy as np
import pickle

In [2]:
## loading both the datasets obtained previously

data = pd.read_csv('data3.csv')
comment_data = pd.read_csv('data4.csv')

## Utility Functions for preprocessing and cleaning data

In [3]:
## creating a list of stopwords and punctuations

stop = stopwords.words("english")
punctuations = list(string.punctuation)
stop = stop + punctuations ## to remove punctuations

In [4]:
## Removing bad symbols

def preprocess(text):
    text = text.str.replace("//", " ")
    text = text.str.replace("/", " ")
    text = text.str.replace(":", " ")
    text = text.str.replace("'\'", " ")
    text = text.str.replace("'", " ")
    text = text.str.replace('[',  " ")
    text = text.str.replace('='," ")
    text = text.str.replace(']'," ")
    text = text.str.replace('['," ")
    text = text.str.replace(')', ' ')
    text = text.str.replace('(', ' ')
    text = text.str.replace('\\n', ' ')
    text = text.str.replace('\\t', ' ')
    text = text.str.replace('\\', ' ')
    text = text.str.replace('@', ' ')  
    text = text.str.replace('<', ' ') 
    text = text.str.replace('>', ' ') 

    return text


In [5]:
## creating an object for lemmatization

lt = WordNetLemmatizer()

In [6]:
## fucntion to get pos_tag of a word

def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [7]:
## Function to remove emoji's from the text

def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')

In [8]:
## combining the above three functions 
## cleaning words
## removing stop words, punctuations, emojis
## performing lemmatization on the words

def clean_review(words):
    output_words = []
    for w in words:
        w = deEmojify(w) # removing emojis
        if w.lower() not in stop: # removing stopwords
            try:
                pos = pos_tag([w]) # finding the pos tag of the word
                clean_word = lt.lemmatize(w,pos = get_simple_pos(pos[0][1])) # lemmatizing the word
                output_words.append(clean_word.lower()) # adding the word in lower case as cleaned word
            except:
                continue
    return np.array(output_words) # returning the array of cleaned words

# USING TITLE

In [9]:
## loading the text and labels y from our dataframe

text = np.array(data["title"])
y = np.array(data["flair"])
text.shape, y.shape

((2552,), (2552,))

In [10]:
train_text = text
y_train = y

In [11]:
## splitting the data into 2 parts, one for training and one for testing the performance of the classifier

train_text, test_text, y_train, y_test = train_test_split(text, y, random_state = 1)
train_text.shape, y_train.shape

((1914,), (1914,))

In [12]:
## forming training documents with each entry as a tuple containing text and the respective flair
## ie. documents = [('jhd odjsd jfs .. ', 'AskIndia'), ('dsjd...', 'Politics') ..]

documents = [] ## training documents
for i in range(len(y_train)):
    documents.append((train_text[i].split(" "),y_train[i]))
documents[0]

(['Some', 'mind', 'blowing', 'facts', 'about', 'Manmohan', 'Singh'],
 'Science/Technology')

In [13]:
## cleaning the training data, removing stopwords, punctuations, emojis, and lemmatizing the words

documents = [(clean_review(i),category) for i,category in documents]

In [14]:
documents[0] # a look at how the first entry looks like

(array(['mind', 'blowing', 'fact', 'manmohan', 'singh'], dtype='<U8'),
 'Science/Technology')

In [15]:
## now forming x_train and y_train from the cleaned data(ie documents)

y_train = [category for words,category in documents]
x_train = [" ".join(words) for words, categories in documents] # joining the cleaned words to form text for training

In [16]:
## forming sparse matrix of x_train and x_test using TfidfVectorizer which can be directly used as input in sklearn classifiers

token_vec = TfidfVectorizer(max_features = 3000, ngram_range = (1,3))
x_train = token_vec.fit_transform(x_train) ## we can use the sparse matrix directyly as training and testing data in our sklearn classifiers


In [17]:
## transforming x_test into a sparse matrix

x_test = token_vec.transform(test_text)

In [18]:
## data is now ready for being used in sklearn classifiers
## x_train, y_train, x_test, y_test

### Classification using different classifiers:

1. Multinomial Naive Bayes
2. Logistic Regression
3. Random Forests
4. Linear SVM


### Multinomial Naive Bayes

In [19]:
clf1 = MultinomialNB(alpha = 2)
clf1.fit(x_train,y_train)

MultinomialNB(alpha=2, class_prior=None, fit_prior=True)

In [20]:
clf1.score(x_test, y_test)

0.6520376175548589

### Logistic Regression

In [21]:
clf2 = LogisticRegression(C = 0.5)
clf2.fit(x_train,y_train)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
clf2.score(x_test, y_test)

0.6959247648902821

### Random Forest Classifier

In [23]:
clf3 = RandomForestClassifier() 
clf3.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [24]:
clf3.score(x_test, y_test)

0.6865203761755486

### Linear SVM

In [25]:
clf4 = SVC()

In [26]:
clf4 = SVC()
clf4.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [27]:
clf4.score(x_test, y_test)

0.6551724137931034

In [28]:
## I have combined all the above steps till obtaining the sparse matrices for our data into a single function get_data
## in order to avoid redundancy of code

In [14]:
## function to return the data in the required form as sparse martices to be used in sklearn classifiers
def get_data(data, key):
    
    #data[key] = preprocess(data[key]) ## preprocessing the data by removing bad symbols etc
    #print(data[key])
    #data.dropna(inplace = True) ## removing rows having nan values from the dataframe
    text = np.array(data[key])
    y = np.array(data['flair'])
    train_text, test_text, y_train, y_test = train_test_split(text, y, random_state = 1) ## splitting the data into train and test
    documents = [] ## forming training documents with each entry as a tuple containing text and the respective flair
    ## ie. documents = [('jhd odjsd jfs .. ', 'AskIndia'), ('dsjd...', 'Politics') ..]
    for i in range(len(y_train)):
        documents.append((train_text[i].split(" "),y_train[i]))
    ## cleaning the training documents 
    documents = [(clean_review(i),category) for i,category in documents] ## clean review function defined above
    print(documents[0])
    ## now forming x_train and y_train from the cleaned data(ie documents)
    y_train = [category for words,category in documents]
    x_train = [" ".join(words) for words, categories in documents] ## joining the cleaned words of the documents
    ## forming sparse matrix of x_train and x_test using TfidfVectorizer
    ## creating an object of TfidfVectorizer with 3000 words as features and an ngram range of (1,3)
    token_vec = TfidfVectorizer(max_features = 3000, ngram_range = (1,3))
    x_train = token_vec.fit_transform(x_train) 
    x_test = token_vec.transform(test_text)
    return x_train, x_test, y_train, y_test, token_vec

# Using URL 

In [30]:
## New preprocessing function for removing bad symbols from a url

def preprocess(text):
    text = text.str.replace("//", " ")
    text = text.str.replace('.', ' ')
    text = text.str.replace('https:', ' ')
    text = text.str.replace('_', ' ')
    text = text.str.replace('-', ' ')
    text = text.str.replace("/", " ")
    text = text.str.replace("'\'", " ")
    text = text.str.replace("'", " ")
    text = text.str.replace('[',  " ")
    text = text.str.replace('='," ")
    text = text.str.replace(']'," ")
    text = text.str.replace('['," ")
    text = text.str.replace(')', ' ')
    text = text.str.replace('(', ' ')
    text = text.str.replace('\\n', ' ')
    text = text.str.replace('\\t', ' ')
    text = text.str.replace('\\', ' ')
    text = text.str.replace('@', ' ')  
    text = text.str.replace('<', ' ') 
    text = text.str.replace('>', ' ') 
    text = text.str.replace("'", ' ') 


    return text


In [31]:
data['url'] = preprocess(data['url'])

In [32]:
x_train, x_test, y_train, y_test, token_vec = get_data(data, 'url')

(array(['www', 'reddit', 'com', 'r', 'india', 'comment', 'd9fh9n', 'mind',
       'blowing', 'fact', 'manmohan', 'singh'], dtype='<U8'), 'Science/Technology')


In [33]:
data['url'][0]

'  www telegraphindia com india coronavirus outbreak untouchability even in quarantine cid 1764186'

### Multinomial Naive Bayes

In [34]:
clf1 = MultinomialNB(alpha = 2)
clf1.fit(x_train,y_train)

MultinomialNB(alpha=2, class_prior=None, fit_prior=True)

In [35]:
clf1.score(x_test, y_test)

0.4169278996865204

### Logistic Regression

In [36]:
clf2 = LogisticRegression()
clf2.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
clf2.score(x_test, y_test)

0.47648902821316613

### Random Forest Classifier

In [38]:
clf3 = RandomForestClassifier()
clf3.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [39]:
clf3.score(x_test, y_test)

0.4670846394984326

### Linear Support Vector Machines (SVM)

In [40]:
clf4 = SVC()
clf4.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [41]:
clf4.score(x_test, y_test)

0.4608150470219436

## USING COMMENTS

In [42]:
comment_data = pd.read_csv('data4.csv')
comment_data

Unnamed: 0,flair,body
0,Scheduled,"Let them feel hungry for a couple of days, max..."
1,Scheduled,This is beyond petty.> The inclusion of a Delh...
2,Scheduled,"My hunch is , this guy is trying to expose the..."
3,Scheduled,Muslims and reservation are two distractions u...
4,Scheduled,"Bachega India, tabhi toh Padhega India.Gand ma..."
...,...,...
2547,Coronavirus,
2548,Coronavirus,They did a FAB job! Hats off to the whole team.
2549,Coronavirus,Currently seeing the numbers in Mumbai i don't...
2550,Coronavirus,Yeah u need a skullcap to be virus here /sHis ...


In [43]:
comment_data.dropna(inplace = True)
comment_data

Unnamed: 0,flair,body
0,Scheduled,"Let them feel hungry for a couple of days, max..."
1,Scheduled,This is beyond petty.> The inclusion of a Delh...
2,Scheduled,"My hunch is , this guy is trying to expose the..."
3,Scheduled,Muslims and reservation are two distractions u...
4,Scheduled,"Bachega India, tabhi toh Padhega India.Gand ma..."
...,...,...
2544,Coronavirus,"> Kabootar or Kachori, no one will be spared. ..."
2545,Coronavirus,Most of those isolation wards are dangerous wh...
2548,Coronavirus,They did a FAB job! Hats off to the whole team.
2549,Coronavirus,Currently seeing the numbers in Mumbai i don't...


In [44]:
comment_data['body'] = preprocess(comment_data['body'])

In [45]:
x_train, x_test, y_train, y_test, token_vec = get_data(comment_data, 'body')

(array(['time', 'ritual', 'smellwaiting', 'next', 'task', 'declare',
       'whatever', 'want', 'declare', '1', '2', 'day', 'before?', 'last',
       'minute', 'declaration', 'cause', 'chaos', 'hope', 'red', 'orange',
       'green', 'thing', '10am', '8pm', 'might', 'relax', 'condition',
       'make', 'life', 'easy', 'non', 'impact', 'region', 'ramayan', 'ke',
       'baad', 'siddha', 'modiji', 'ke', 'darshanany', 'guess', 'would',
       'new', 'task', '?tn', 'already', 'extend', 'lockdown', 'till',
       'april', '30', 'tweet', 'come', 'tweet', 'twitter', 'com',
       'pmoindia', 'status', '1249620775679610880?s', '21', 'bet',
       'would', 'full', 'lockdown', 'extend', 'behalf', 'shift',
       'decision', 'cm', 'respectively', 'state', 'bjp', 'shame', 'later',
       'oppose', 'party', 'state', 'come', 'directly', 'tv,', 'first',
       'announcement', 'give', 'time', 'everyone', 'glue', 'tv', 'poora',
       'tamasha', 'banaa', 'diya', 'hai', 'fir', 'shuru', 'tamasha',
      

### Multinomial Naive Bayes

In [46]:
clf1 = MultinomialNB(alpha = 2)
clf1.fit(x_train,y_train)

MultinomialNB(alpha=2, class_prior=None, fit_prior=True)

In [47]:
clf1.score(x_test, y_test)

0.30139372822299654

### Logistic Regression

In [48]:
clf2 = LogisticRegression(tol = 0.00001, max_iter = 100)

In [49]:
clf2.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=1e-05, verbose=0,
                   warm_start=False)

In [50]:
clf2.score(x_test, y_test)

0.4146341463414634

### Random Forest Classifier

In [51]:
clf3 = RandomForestClassifier()

In [52]:
clf3.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [53]:
clf3.score(x_test, y_test)

0.44076655052264807

### Linear SVM

In [54]:
clf4 = SVC()

In [55]:
clf4.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [56]:
clf4.score(x_test, y_test)

0.31881533101045295

## TITLE + URL

In [57]:
## Loading both the datasets

data1 = pd.read_csv('data3.csv')


In [58]:
data1['url'] = preprocess(data1['url'])

In [59]:
## forming a new dataframe containing both title of the reddit post along with the text of top comments

df = pd.DataFrame(data1['title'] + data1['url'], columns = ['title + url'])
df['flair'] = data1['flair']

In [60]:
df

Unnamed: 0,title + url,flair
0,"Untouchability, even in quarantine. 'We have n...",Scheduled
1,Delhi Govt Sources: Names of CM Arvind Kejriwa...,Scheduled
2,"Delhi: AP Singh, advocate of 2012 Delhi gang-r...",Scheduled
3,Why the Supreme Court’s verdict on SC/ST quota...,Scheduled
4,What about the entrance exams scheduled in May...,Scheduled
...,...,...
2547,Coronavirus: The race to stop the virus spread...,Coronavirus
2548,Explained: The ‘Bhilwara model’ of ‘ruthless c...,Coronavirus
2549,India to extend lockdown by 2 weeks www reddi...,Coronavirus
2550,My name is Chang and I am not coronavirus you...,Coronavirus


In [61]:
x_train, x_test, y_train, y_test, token_vec = get_data(df, 'title + url')

(array(['mind', 'blowing', 'fact', 'manmohan', 'singh', 'www', 'reddit',
       'com', 'r', 'india', 'comment', 'd9fh9n', 'mind', 'blowing',
       'fact', 'manmohan', 'singh'], dtype='<U8'), 'Science/Technology')


### Multinomial Naive Bayes

In [62]:
clf1 = MultinomialNB(alpha = 2)
clf1.fit(x_train,y_train)

MultinomialNB(alpha=2, class_prior=None, fit_prior=True)

In [63]:
clf1.score(x_test, y_test)

0.603448275862069

### Logistic Regression

In [64]:
clf2 = LogisticRegression(C = 2.5)
clf2.fit(x_train,y_train)

LogisticRegression(C=2.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [65]:
clf2.score(x_test, y_test)

0.6755485893416928

### Random Forest Classifier

In [66]:
clf3 = RandomForestClassifier()
clf3.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [67]:
clf3.score(x_test, y_test)

0.6755485893416928

### Linear SVM

In [68]:
clf4 = SVC(C=4)
clf4.fit(x_train, y_train)

SVC(C=4, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [69]:
clf4.score(x_test, y_test)

0.6551724137931034

## COMMENTS+TITLE

Now I am using both comments and titles as text to train my classifier

In [9]:
## Loading both the datasets

data1 = pd.read_csv('data3.csv')
data2 = pd.read_csv('data4.csv')

In [10]:
## forming a new dataframe containing both title of the reddit post along with the text of top comments

df = pd.DataFrame(data1['title'] + data2['body'], columns = ['title + comments'])
df['flair'] = data1['flair']

In [11]:
## Removing the bad symbols

df["title + comments"] = preprocess(df['title + comments'])

In [12]:
## Removing NaN values

df.dropna(inplace = True)
df

Unnamed: 0,title + comments,flair
0,"Untouchability, even in quarantine. We have n...",Scheduled
1,Delhi Govt Sources Names of CM Arvind Kejriwa...,Scheduled
2,"Delhi AP Singh, advocate of 2012 Delhi gang-r...",Scheduled
3,Why the Supreme Court’s verdict on SC ST quota...,Scheduled
4,What about the entrance exams scheduled in May...,Scheduled
...,...,...
2544,Coronavirus lockdown 2 cops on duty injured i...,Coronavirus
2545,Why are patients fleeing India’s coronavirus i...,Coronavirus
2548,Explained The ‘Bhilwara model’ of ‘ruthless c...,Coronavirus
2549,India to extend lockdown by 2 weeksCurrently s...,Coronavirus


In [15]:
x_train, x_test, y_train, y_test, token_vec = get_data(df, 'title + comments')

(array(['coronavirus', 'lockdown', 'pm', 'narendra', 'modi', 'address',
       'nation', '10', 'tomorrow', 'amid', 'state', 'demand', 'extend',
       'lockdown', 'till', 'april', '30it', 'time', 'ritual',
       'smellwaiting', 'next', 'task.why', 'declare', 'whatever', 'want',
       'declare', '1-2', 'day', 'before?', 'last', 'minute',
       'declaration', 'cause', 'chaos.', 'hope', 'red-orange-green',
       'thing.10am', '8pm', 'might', 'relax', 'condition', 'make', 'life',
       'easy', 'non', 'impact', 'regions.ramayan', 'ke', 'baad', 'siddha',
       'modiji', 'ke', 'darshanany', 'guess', 'would', 'new', 'task',
       '?tn', 'already', 'extend', 'lockdown', 'till', 'april', '30',
       'tweet', 'came.', 'tweet', 'http', 'twitter.com', 'pmoindia',
       'status', '1249620775679610880?s', '21', 'bet', 'would', 'full',
       'lockdown', 'extend', 'behalf', 'shift', 'decision', 'cm',
       'respectively', 'state', 'bjp', 'shame', 'later', 'oppose',
       'party', 'states.he

In [None]:
pickle.dump(token_vec, open('transform1.pkl', 'wb'))

## Multinomial Naive Bayes

In [76]:
clf1 = MultinomialNB(alpha = 2)
clf1.fit(x_train,y_train)

MultinomialNB(alpha=2, class_prior=None, fit_prior=True)

In [77]:
clf1.score(x_test, y_test)

0.44947735191637633

## Logistic Regression

In [78]:
clf2 = LogisticRegression(tol = 0.00001, max_iter = 100)
clf2.fit(x_train, y_train)
clf2.score(x_test, y_test)

0.6533101045296167

## Random Forests Classifier (Chosen Classifier)

In [16]:
clf3 = RandomForestClassifier(n_estimators = 150, criterion = 'gini')
clf3.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [80]:
pickle.dump(clf3, open('model1.pkl', 'wb'))

In [17]:
clf3.score(x_test, y_test)

0.7473867595818815

In [18]:
y_pred = clf3.predict(x_test)

In [83]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.68      0.75      0.71        61
  Business/Finance       0.62      0.67      0.65        60
       Coronavirus       0.75      0.84      0.79        45
              Food       0.69      0.84      0.76        50
     Non-Political       1.00      0.94      0.97        53
       Photography       0.97      0.66      0.79        53
    Policy/Economy       0.68      0.54      0.60        50
          Politics       0.60      0.69      0.64        51
         Scheduled       0.63      0.55      0.59        44
Science/Technology       0.73      0.56      0.64        39
            Sports       0.74      0.85      0.79        68

          accuracy                           0.73       574
         macro avg       0.74      0.72      0.72       574
      weighted avg       0.74      0.73      0.73       574

[[46  2  3  2  0  0  1  3  2  0  2]
 [ 4 40  1  0  0  0  6  3  1  2  3]
 [ 0  0 38  3  0  0  1  0

In [19]:
from sklearn.metrics import f1_score

In [26]:
f1_score(y_test, y_pred, average = "weighted")

0.748739883600557

### Linear SVM

In [84]:
clf4 = SVC()
clf4.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [85]:
clf4.score(x_test, y_test)

0.5958188153310104

# COMMENT + TITLE + URL

In [86]:
## Loading both the datasets

data1 = pd.read_csv('data3.csv')
data2 = pd.read_csv('data4.csv')


In [87]:
## forming a new dataframe containing both title of the reddit post along with the text of top comments

df = pd.DataFrame(data1['title'] + data1['url'] + data2['body'], columns = ['title + comments + url'])
df['flair'] = data1['flair']

In [88]:
## Removing the bad symbols

df["title + comments + url"] = preprocess(df['title + comments + url'])

In [89]:
## Removing NaN values

df.dropna(inplace = True)
df

Unnamed: 0,title + comments + url,flair
0,"Untouchability, even in quarantine We have n...",Scheduled
1,Delhi Govt Sources: Names of CM Arvind Kejriwa...,Scheduled
2,"Delhi: AP Singh, advocate of 2012 Delhi gang r...",Scheduled
3,Why the Supreme Court’s verdict on SC ST quota...,Scheduled
4,What about the entrance exams scheduled in May...,Scheduled
...,...,...
2544,Coronavirus lockdown: 2 cops on duty injured i...,Coronavirus
2545,Why are patients fleeing India’s coronavirus i...,Coronavirus
2548,Explained: The ‘Bhilwara model’ of ‘ruthless c...,Coronavirus
2549,India to extend lockdown by 2 weeks www reddi...,Coronavirus


In [90]:
x_train, x_test, y_train, y_test, token_vec = get_data(df, 'title + comments + url')

(array(['coronavirus', 'lockdown:', 'pm', 'narendra', 'modi', 'address',
       'nation', '10', 'tomorrow', 'amid', 'state', 'demand', 'extend',
       'lockdown', 'till', 'april', '30', 'www', 'ndtv', 'com', 'india',
       'news', 'coronavirus', 'lockdown', 'pm', 'narendra', 'modi',
       'address', 'nation', '10', 'tomorrow', 'amid', 'state', 'demand',
       'exte', '2210864it', 'time', 'ritual', 'smellwaiting', 'next',
       'task', 'declare', 'whatever', 'want', 'declare', '1', '2', 'day',
       'before?', 'last', 'minute', 'declaration', 'cause', 'chaos',
       'hope', 'red', 'orange', 'green', 'thing', '10am', '8pm', 'might',
       'relax', 'condition', 'make', 'life', 'easy', 'non', 'impact',
       'region', 'ramayan', 'ke', 'baad', 'siddha', 'modiji', 'ke',
       'darshanany', 'guess', 'would', 'new', 'task', '?tn', 'already',
       'extend', 'lockdown', 'till', 'april', '30', 'tweet', 'come',
       'tweet', 'twitter', 'com', 'pmoindia', 'status',
       '12496207756

## Multinomial Naive Bayes

In [91]:
clf1 = MultinomialNB(alpha = 2)
clf1.fit(x_train,y_train)

MultinomialNB(alpha=2, class_prior=None, fit_prior=True)

In [92]:
clf1.score(x_test, y_test)

0.49477351916376305

## Logistic Regression

In [93]:
clf2 = LogisticRegression()
clf2.fit(x_train, y_train)
clf2.score(x_test, y_test)

0.6968641114982579

## Random Forests Classifier

In [94]:
clf3 = RandomForestClassifier()
clf3.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [95]:
clf3.score(x_test, y_test)

0.7038327526132404

In [96]:
y_pred = clf3.predict(x_test)

In [97]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

                    precision    recall  f1-score   support

          AskIndia       0.54      0.80      0.65        61
  Business/Finance       0.66      0.55      0.60        60
       Coronavirus       0.77      0.89      0.82        45
              Food       0.58      0.88      0.70        50
     Non-Political       0.98      0.89      0.93        53
       Photography       0.93      0.70      0.80        53
    Policy/Economy       0.68      0.46      0.55        50
          Politics       0.59      0.67      0.62        51
         Scheduled       0.56      0.57      0.56        44
Science/Technology       0.72      0.54      0.62        39
            Sports       0.98      0.75      0.85        68

          accuracy                           0.70       574
         macro avg       0.73      0.70      0.70       574
      weighted avg       0.73      0.70      0.71       574

[[49  2  1  3  0  0  0  2  4  0  0]
 [ 5 33  3  4  1  0  4  4  5  1  0]
 [ 0  0 40  5  0  0  0  0

### As we can see from the above results and the confusion matrix, the classifier works well for classification of flairs such as Sports and Non-Political.
### However, in other categories, it gets confused due to a great amount of similarity between most of the flairs due to ongoing Covid19 crisis, because of which most of the other flairs have similar content. 

In [99]:
clf4 = SVC()
clf4.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [100]:
clf4.score(x_test, y_test)

0.6411149825783972