##Install Required Modules

In [0]:
# !pip install scikit-learn
# !pip install pickle
# !pip install pandas 
# !pip install bs4
# !pip install re
# !pip install nltk

##Import Libraires and Required Dependencies

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
import pickle
import pandas as pd
import pickle
from bs4 import BeautifulSoup
import re
import nltk
nltk.download('all')
from nltk.corpus import stopwords

##Loading the Scrapped Data for training the models
We first load the data but we have to clean it again to process empty spaces in the dataset such as some posts have no body, some posts have no comments, else there are errors while training  

In [0]:
flairs = ["AskIndia", "Non-Political", "Scheduled", "Photography", "Science/Technology", "Politics", "Business/Finance", "Policy/Economy", "Sports", "Food", "Coronavirus"]

data = pd.read_csv('/content/data1.csv')
data.head()

replace_by_space = re.compile('[/(){}\[\]\|@,;]')
replace_symbol = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = replace_by_space.sub(' ', text) # replace certain symbols by space in text
    text = replace_symbol.sub('', text) # delete symbols from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove STOPWORDS from text
    return text

def to_str(text):
  return str(text)

data['title'] = data['title'].apply(to_str)
data['title'] = data['title'].apply(clean_text)
data['url'] = data['url'].apply(to_str)
data['url'] = data['url'].apply(clean_text)
data['body'] = data['body'].apply(to_str)
data['body'] = data['body'].apply(clean_text)
data['comments'] = data['comments'].apply(to_str)
data['comments'] = data['comments'].apply(clean_text)
data['combined_features'] = data['combined_features'].apply(to_str)
data['combined_features'] = data['combined_features'].apply(clean_text)

##Naive Bayes Classifier
Naive Bayes is a family of algorithms based on applying Bayes theorem with a strong(naive) assumption, that every feature is independent of the others, in order to predict the category of a given sample. They are probabilistic classifiers, therefore will calculate the probability of each category using Bayes theorem, and the category with the highest probability will be output.

In [0]:
def nb_classifier(X_train, X_test, y_train, y_test):
  
  from sklearn.naive_bayes import MultinomialNB


  nb = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MultinomialNB()),
                ])
  nb.fit(X_train, y_train)

  y_pred = nb.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

##Linear Support Vector Machine
The objective of the support vector machine algorithm is to find a hyperplane in an N-dimensional space(N — the number of features) that distinctly classifies the data points.
Our objective is to find a plane that has the maximum margin, i.e the maximum distance between data points of both classes. 

In [0]:
def linear_svm(X_train, X_test, y_train, y_test):
  
  from sklearn.linear_model import SGDClassifier

  sgd = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=10, tol=None)),
                 ])
  sgd.fit(X_train, y_train)

  y_pred = sgd.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

##Logistic Regression
Logistic Regression is a core supervised learning technique for solving classification problems. 

In [0]:
def logisticreg(X_train, X_test, y_train, y_test):

  from sklearn.linear_model import LogisticRegression

  logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=1, C=1e5, max_iter=100)),
                 ])
  logreg.fit(X_train, y_train)

  y_pred = logreg.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

##Random Forest
Random forest, like its name implies, consists of a large number of individual decision trees that operate as an ensemble. Each individual tree in the random forest spits out a class prediction and the class with the most votes becomes our model’s prediction

In [0]:
def randomforest(X_train, X_test, y_train, y_test):
  
  from sklearn.ensemble import RandomForestClassifier
  
  ranfor = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),
                 ])
  ranfor.fit(X_train, y_train)

  y_pred = ranfor.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

##MLP Classifier
Multi-layer Perceptron classifier.
This model optimizes the log-loss function using stochastic gradient descent.

In [0]:
def mlpclassifier(X_train, X_test, y_train, y_test):
  
  from sklearn.neural_network import MLPClassifier
  
  mlp = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', MLPClassifier(hidden_layer_sizes=(30,30,30))),
                 ])
  mlp.fit(X_train, y_train)

  y_pred = mlp.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

##Training Loop 
Train all 5 models with differentfeature

In [0]:
def train_test(X,y):
 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)
   
    print("Results of Naive Bayes Classifier")
    nb_classifier(X_train, X_test, y_train, y_test)
    print("Results of Linear Support Vector Machine")
    linear_svm(X_train, X_test, y_train, y_test)
    print("Results of Logistic Regression")
    logisticreg(X_train, X_test, y_train, y_test)
    print("Results of Random Forest")
    randomforest(X_train, X_test, y_train, y_test)
    print("Results of MLP Classifier")
    mlpclassifier(X_train, X_test, y_train, y_test)

##Dividing data into diiferent features to train on 

In [0]:
cat = data.flair

X = data.title
Y = data.body
Z = data.url
V = data.combined_features
W = data.comments

##Flair Detection using Title as Feature

In [27]:
train_test(X,cat)

Results of Naive Bayes Classifier
accuracy 0.6545454545454545
                    precision    recall  f1-score   support

          AskIndia       0.71      0.50      0.59        24
     Non-Political       0.27      0.43      0.33        14
         Scheduled       0.60      0.94      0.73        16
       Photography       0.73      0.84      0.78        19
Science/Technology       0.67      0.84      0.74        19
          Politics       0.88      0.84      0.86        25
  Business/Finance       0.59      0.45      0.51        22
    Policy/Economy       0.76      0.52      0.62        25
            Sports       0.59      0.59      0.59        17
              Food       0.57      0.67      0.62        18
       Coronavirus       0.93      0.62      0.74        21

          accuracy                           0.65       220
         macro avg       0.66      0.66      0.65       220
      weighted avg       0.68      0.65      0.66       220

Results of Linear Support Vector Ma

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


accuracy 0.7363636363636363
                    precision    recall  f1-score   support

          AskIndia       0.90      0.79      0.84        24
     Non-Political       0.24      0.43      0.31        14
         Scheduled       0.94      0.94      0.94        16
       Photography       0.94      0.84      0.89        19
Science/Technology       0.94      0.89      0.92        19
          Politics       0.95      0.84      0.89        25
  Business/Finance       0.46      0.50      0.48        22
    Policy/Economy       0.78      0.72      0.75        25
            Sports       0.62      0.76      0.68        17
              Food       0.67      0.67      0.67        18
       Coronavirus       0.93      0.67      0.78        21

          accuracy                           0.74       220
         macro avg       0.76      0.73      0.74       220
      weighted avg       0.78      0.74      0.75       220

Results of Random Forest
accuracy 0.7090909090909091
                

##Flair Detection using Body as Feature

In [28]:
train_test(Y,cat)

Results of Naive Bayes Classifier
accuracy 0.24545454545454545
                    precision    recall  f1-score   support

          AskIndia       0.44      0.29      0.35        24
     Non-Political       0.13      0.57      0.21        14
         Scheduled       0.14      0.88      0.24        16
       Photography       0.00      0.00      0.00        19
Science/Technology       0.00      0.00      0.00        19
          Politics       0.85      0.44      0.58        25
  Business/Finance       0.40      0.36      0.38        22
    Policy/Economy       0.00      0.00      0.00        25
            Sports       0.50      0.18      0.26        17
              Food       0.50      0.06      0.10        18
       Coronavirus       1.00      0.10      0.17        21

          accuracy                           0.25       220
         macro avg       0.36      0.26      0.21       220
      weighted avg       0.38      0.25      0.22       220

Results of Linear Support Vector M

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.38181818181818183
                    precision    recall  f1-score   support

          AskIndia       0.45      0.58      0.51        24
     Non-Political       0.50      0.57      0.53        14
         Scheduled       0.00      0.00      0.00        16
       Photography       0.14      0.79      0.24        19
Science/Technology       0.33      0.05      0.09        19
          Politics       1.00      0.52      0.68        25
  Business/Finance       0.65      0.50      0.56        22
    Policy/Economy       0.00      0.00      0.00        25
            Sports       0.62      0.47      0.53        17
              Food       0.54      0.39      0.45        18
       Coronavirus       0.78      0.33      0.47        21

          accuracy                           0.38       220
         macro avg       0.46      0.38      0.37       220
      weighted avg       0.47      0.38      0.38       220

Results of Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.35909090909090907
                    precision    recall  f1-score   support

          AskIndia       0.53      0.38      0.44        24
     Non-Political       0.47      0.50      0.48        14
         Scheduled       0.14      0.88      0.24        16
       Photography       0.50      0.05      0.10        19
Science/Technology       0.13      0.11      0.12        19
          Politics       0.88      0.56      0.68        25
  Business/Finance       0.52      0.50      0.51        22
    Policy/Economy       0.00      0.00      0.00        25
            Sports       0.50      0.47      0.48        17
              Food       0.54      0.39      0.45        18
       Coronavirus       1.00      0.29      0.44        21

          accuracy                           0.36       220
         macro avg       0.47      0.37      0.36       220
      weighted avg       0.48      0.36      0.36       220

Results of Random Forest
accuracy 0.35
                    precision

  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.2636363636363636
                    precision    recall  f1-score   support

          AskIndia       0.40      0.08      0.14        24
     Non-Political       0.20      0.36      0.26        14
         Scheduled       0.14      0.88      0.24        16
       Photography       0.50      0.05      0.10        19
Science/Technology       0.06      0.05      0.06        19
          Politics       0.92      0.44      0.59        25
  Business/Finance       0.62      0.36      0.46        22
    Policy/Economy       0.00      0.00      0.00        25
            Sports       0.41      0.41      0.41        17
              Food       0.19      0.28      0.23        18
       Coronavirus       0.80      0.19      0.31        21

          accuracy                           0.26       220
         macro avg       0.39      0.28      0.25       220
      weighted avg       0.40      0.26      0.26       220



  _warn_prf(average, modifier, msg_start, len(result))


##Flair Detection using URL as Feature

In [29]:
train_test(Z,cat)

Results of Naive Bayes Classifier
accuracy 0.2590909090909091
                    precision    recall  f1-score   support

          AskIndia       0.20      1.00      0.34        24
     Non-Political       0.25      0.07      0.11        14
         Scheduled       0.30      0.44      0.36        16
       Photography       0.10      0.05      0.07        19
Science/Technology       0.40      0.21      0.28        19
          Politics       0.45      0.20      0.28        25
  Business/Finance       0.20      0.05      0.07        22
    Policy/Economy       0.42      0.20      0.27        25
            Sports       0.21      0.18      0.19        17
              Food       0.36      0.22      0.28        18
       Coronavirus       1.00      0.10      0.17        21

          accuracy                           0.26       220
         macro avg       0.36      0.25      0.22       220
      weighted avg       0.36      0.26      0.22       220

Results of Linear Support Vector Ma

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


accuracy 0.2772727272727273
                    precision    recall  f1-score   support

          AskIndia       0.20      1.00      0.34        24
     Non-Political       0.38      0.21      0.27        14
         Scheduled       0.38      0.50      0.43        16
       Photography       0.09      0.05      0.07        19
Science/Technology       0.40      0.11      0.17        19
          Politics       0.38      0.24      0.29        25
  Business/Finance       0.25      0.09      0.13        22
    Policy/Economy       0.46      0.24      0.32        25
            Sports       0.25      0.12      0.16        17
              Food       0.50      0.22      0.31        18
       Coronavirus       1.00      0.14      0.25        21

          accuracy                           0.28       220
         macro avg       0.39      0.27      0.25       220
      weighted avg       0.39      0.28      0.25       220

Results of Random Forest
accuracy 0.16818181818181818
               

##Flair Detection using Comments as Feature

In [30]:
train_test(W,cat)

Results of Naive Bayes Classifier
accuracy 0.36363636363636365
                    precision    recall  f1-score   support

          AskIndia       0.17      0.04      0.07        24
     Non-Political       0.22      0.93      0.35        14
         Scheduled       0.24      0.56      0.34        16
       Photography       0.45      0.53      0.49        19
Science/Technology       0.00      0.00      0.00        19
          Politics       0.88      0.56      0.68        25
  Business/Finance       0.32      0.32      0.32        22
    Policy/Economy       0.42      0.32      0.36        25
            Sports       0.23      0.18      0.20        17
              Food       0.36      0.22      0.28        18
       Coronavirus       1.00      0.52      0.69        21

          accuracy                           0.36       220
         macro avg       0.39      0.38      0.34       220
      weighted avg       0.41      0.36      0.35       220

Results of Linear Support Vector M

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


accuracy 0.39090909090909093
                    precision    recall  f1-score   support

          AskIndia       0.20      0.08      0.12        24
     Non-Political       0.34      0.79      0.48        14
         Scheduled       0.24      0.62      0.34        16
       Photography       0.53      0.47      0.50        19
Science/Technology       0.10      0.11      0.10        19
          Politics       0.79      0.60      0.68        25
  Business/Finance       0.50      0.18      0.27        22
    Policy/Economy       0.44      0.48      0.46        25
            Sports       0.31      0.29      0.30        17
              Food       0.38      0.28      0.32        18
       Coronavirus       0.73      0.52      0.61        21

          accuracy                           0.39       220
         macro avg       0.42      0.40      0.38       220
      weighted avg       0.43      0.39      0.38       220

Results of Random Forest
accuracy 0.38636363636363635
              

##Flair Detection using Combined Features as Feature

In [31]:
train_test(V,cat)

Results of Naive Bayes Classifier
accuracy 0.5363636363636364
                    precision    recall  f1-score   support

          AskIndia       0.33      0.08      0.13        24
     Non-Political       0.21      0.93      0.34        14
         Scheduled       0.83      0.62      0.71        16
       Photography       0.61      0.58      0.59        19
Science/Technology       0.50      0.16      0.24        19
          Politics       0.86      0.76      0.81        25
  Business/Finance       0.42      0.77      0.55        22
    Policy/Economy       0.80      0.32      0.46        25
            Sports       0.67      0.35      0.46        17
              Food       0.72      0.72      0.72        18
       Coronavirus       1.00      0.76      0.86        21

          accuracy                           0.54       220
         macro avg       0.63      0.55      0.53       220
      weighted avg       0.64      0.54      0.54       220

Results of Linear Support Vector Ma

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


accuracy 0.7181818181818181
                    precision    recall  f1-score   support

          AskIndia       0.68      0.54      0.60        24
     Non-Political       0.62      0.93      0.74        14
         Scheduled       0.78      0.88      0.82        16
       Photography       0.68      0.68      0.68        19
Science/Technology       0.59      0.53      0.56        19
          Politics       0.87      0.80      0.83        25
  Business/Finance       0.68      0.59      0.63        22
    Policy/Economy       0.64      0.72      0.68        25
            Sports       0.56      0.59      0.57        17
              Food       0.79      0.83      0.81        18
       Coronavirus       1.00      0.90      0.95        21

          accuracy                           0.72       220
         macro avg       0.72      0.73      0.72       220
      weighted avg       0.72      0.72      0.72       220

Results of Random Forest
accuracy 0.7863636363636364
                

##Downloading Logistic Regression and Random Forrest model
training them on combined features as these models have shown the best results on these features using pickle

In [32]:
data = pd.read_csv('/content/data1.csv')
data.head()

data['title'] = data['title'].apply(to_str)
data['title'] = data['title'].apply(clean_text)
data['body'] = data['body'].apply(to_str)
data['body'] = data['body'].apply(clean_text)
data['comments'] = data['comments'].apply(to_str)
data['comments'] = data['comments'].apply(clean_text)
data['combined_features'] = data['combined_features'].apply(to_str)
data['combined_features'] = data['combined_features'].apply(clean_text)

cat = data.flair

V = data.combined_features
W = data.comments
X = data.title
Y = data.body
Z = data.url

X_train, X_test, y_train, y_test = train_test_split(V, cat, test_size=0.2, random_state = 42)
ranfor = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),
                  ])
RM = ranfor.fit(X_train, y_train)
pickle.dump(RM,open("RF_data4.pkl",'wb'))
y_pred = ranfor.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))

logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=1, C=1e5, max_iter=200)),
                 ])

logreg = logreg.fit(X_train, y_train)

pickle.dump(logreg,open("LR_data4.pkl",'wb'))
y_pred = logreg.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))



accuracy 0.7863636363636364
accuracy 0.7136363636363636


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
