## Importing the dependencies

In [1]:
# Importing the dependencies
import pandas as pd
from joblib import dump
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Load the cleaned, previously extracted posts
posts = pd.read_csv('../Data/cleaned_scrapedData.csv')

# Flair names used as the class names in the classification reports
allFlairs = [
    'AMA',
    'AskIndia',
    'Business/Finance',
    'Coronavirus',
    'Food',
    'Non-Political',
    'Photography',
    'Policy/Economy',
    'Politics',
    'Scheduled',
    'Science/Technology',
    'Sports',
    '[R]eddiquette',
]

## Defining some models

In [3]:
# Multinomial Naive Bayes Model
def modelTrainMNB(train_X, train_Y, test_X, test_Y):
    """Fit a TF-IDF + Multinomial Naive Bayes pipeline and print its test metrics.

    Parameters
    ----------
    train_X, test_X : raw text documents used for fitting / evaluation.
    train_Y, test_Y : flair labels aligned with train_X / test_X.

    Prints the per-flair classification report and the overall accuracy.
    Returns None.
    """
    NBClassifier = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('nb', MultinomialNB()),
    ])
    NBClassifier.fit(train_X, train_Y)
    predicted_Y = NBClassifier.predict(test_X)
    # Passing labels explicitly ties every report row to its flair name and
    # avoids a ValueError when a flair is absent from this test split.
    print(classification_report(test_Y, predicted_Y, labels=allFlairs, target_names=allFlairs))
    print("Accuracy : ", accuracy_score(test_Y, predicted_Y))

In [4]:
# Multi-layer Perceptron Model
def modelTrainMLP(train_X, train_Y, test_X, test_Y):
    """Fit a TF-IDF + Multi-layer Perceptron pipeline and print its test metrics.

    Parameters
    ----------
    train_X, test_X : raw text documents used for fitting / evaluation.
    train_Y, test_Y : flair labels aligned with train_X / test_X.

    Prints the per-flair classification report and the overall accuracy.
    Returns None.
    """
    MLP = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        # random_state fixes the weight initialisation and shuffling so that a
        # re-run of the notebook reproduces the same scores.
        ('mlp', MLPClassifier(hidden_layer_sizes=(30, 30, 30), activation='relu', random_state=42)),
    ])
    MLP.fit(train_X, train_Y)
    predicted_Y = MLP.predict(test_X)
    # Passing labels explicitly ties every report row to its flair name and
    # avoids a ValueError when a flair is absent from this test split.
    print(classification_report(test_Y, predicted_Y, labels=allFlairs, target_names=allFlairs))
    print("Accuracy : ", accuracy_score(test_Y, predicted_Y))

In [5]:
# Random Forest Model
def modelTrainRNF(train_X, train_Y, test_X, test_Y):
    """Fit a TF-IDF + Random Forest (1000 trees) pipeline and print its test metrics.

    Parameters
    ----------
    train_X, test_X : raw text documents used for fitting / evaluation.
    train_Y, test_Y : flair labels aligned with train_X / test_X.

    Prints the per-flair classification report and the overall accuracy.
    Returns None.
    """
    RNF = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('rnf', RandomForestClassifier(n_estimators=1000, random_state=42)),
    ])
    RNF.fit(train_X, train_Y)
    predicted_Y = RNF.predict(test_X)
    # Passing labels explicitly ties every report row to its flair name and
    # avoids a ValueError when a flair is absent from this test split.
    print(classification_report(test_Y, predicted_Y, labels=allFlairs, target_names=allFlairs))
    print("Accuracy : ", accuracy_score(test_Y, predicted_Y))

In [6]:
# Logistic Regression Model
def modelTrainLR(train_X, train_Y, test_X, test_Y):
    """Fit a TF-IDF + multinomial Logistic Regression pipeline and print its test metrics.

    Parameters
    ----------
    train_X, test_X : raw text documents used for fitting / evaluation.
    train_Y, test_Y : flair labels aligned with train_X / test_X.

    Prints the per-flair classification report and the overall accuracy.
    Returns None.
    """
    LR = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('lr', LogisticRegression(penalty='l2', random_state=10, solver='lbfgs', multi_class='multinomial')),
    ])
    LR.fit(train_X, train_Y)
    predicted_Y = LR.predict(test_X)
    # Passing labels explicitly ties every report row to its flair name and
    # avoids a ValueError when a flair is absent from this test split.
    print(classification_report(test_Y, predicted_Y, labels=allFlairs, target_names=allFlairs))
    print("Accuracy : ", accuracy_score(test_Y, predicted_Y))

In [7]:
# Linear SVM Model
def modelTrainLSVM(train_X, train_Y, test_X, test_Y):
    """Fit a TF-IDF + Linear SVM pipeline and print its test metrics.

    Parameters
    ----------
    train_X, test_X : raw text documents used for fitting / evaluation.
    train_Y, test_Y : flair labels aligned with train_X / test_X.

    Prints the per-flair classification report and the overall accuracy.
    Returns None.
    """
    LSVM = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        # random_state seeds the solver's internal random number generator so
        # repeated runs give the same model.
        ('linear_svc', LinearSVC(penalty='l2', C=1.0, random_state=42)),
    ])
    LSVM.fit(train_X, train_Y)
    predicted_Y = LSVM.predict(test_X)
    # Passing labels explicitly ties every report row to its flair name and
    # avoids a ValueError when a flair is absent from this test split.
    print(classification_report(test_Y, predicted_Y, labels=allFlairs, target_names=allFlairs))
    print("Accuracy : ", accuracy_score(test_Y, predicted_Y))

In [8]:
# Helper that coerces any value to its string form
def makeString(text):
    """Return the ``str()`` representation of ``text``."""
    as_text = str(text)
    return as_text

In [9]:
# Cast every text attribute (and the flair label) to the string datatype
# NOTE(review): str() turns a missing value (NaN) into the literal 'nan' —
# confirm that is acceptable for these columns.
for column in ['author', 'body', 'comments', 'flair', 'title']:
    posts[column] = posts[column].apply(makeString)

In [10]:
# Building the combined text feature url-author-title-body-comments and the
# matching list of flair labels
textColumns = ['url', 'author', 'title', 'body', 'comments']
train = {}
# Every field is joined with a single space (previously the url and the author
# were concatenated without a separator, which could fuse them into one token).
# astype(str) is a no-op for columns that are already strings and guards the
# url column, which is not converted beforehand.
train["url-author-title-body-comments"] = [
    ' '.join(fields)
    for fields in zip(*(posts[column].astype(str) for column in textColumns))
]
flairsList = posts['flair'].tolist()

## Training every model on the data and checking for the best

In [11]:
# Hold out 10% of the samples for testing; the fixed seed keeps the split repeatable
train_X, test_X, train_Y, test_Y = train_test_split(
    train["url-author-title-body-comments"],
    flairsList,
    test_size=0.1,
    random_state=42,
)
print("features: url-author-title-body-comments")

# Fit and evaluate each model defined above, in turn
modelRuns = [
    ("\n\n Multinomial Naive Bayes", modelTrainMNB),
    ("\n\n Multi-layer Perceptron", modelTrainMLP),
    ("\n\n Random Forest", modelTrainRNF),
    ("\n\n Logistic Regression", modelTrainLR),
    ("\n\n Linear Support Vector Machine", modelTrainLSVM),
]
for heading, trainModel in modelRuns:
    print(heading)
    trainModel(train_X, train_Y, test_X, test_Y)

features: url-author-title-body-comments


 Multinomial Naive Bayes


  _warn_prf(average, modifier, msg_start, len(result))


                    precision    recall  f1-score   support

               AMA       0.40      1.00      0.58        17
          AskIndia       0.30      0.39      0.34        18
  Business/Finance       0.64      0.39      0.48        18
       Coronavirus       0.41      1.00      0.58        21
              Food       0.82      0.53      0.64        17
     Non-Political       0.00      0.00      0.00        21
       Photography       0.95      0.71      0.82        28
    Policy/Economy       0.68      0.65      0.67        20
          Politics       0.60      0.52      0.56        23
         Scheduled       0.62      1.00      0.77        18
Science/Technology       0.67      0.08      0.15        24
            Sports       0.91      0.56      0.69        18
     [R]eddiquette       0.42      0.47      0.44        17

          accuracy                           0.55       260
         macro avg       0.57      0.56      0.52       260
      weighted avg       0.58      0.5

## Dumping the best model

In [13]:
# Re-create the 90/10 split (same seed as the model comparison above)
train_X, test_X, train_Y, test_Y = train_test_split(
    train["url-author-title-body-comments"],
    flairsList,
    test_size=0.1,
    random_state=42,
)
# Final model: TF-IDF features fed into a multinomial logistic regression
tfidfStep = ('tfidf', TfidfVectorizer(stop_words='english'))
lrStep = ('lr', LogisticRegression(penalty='l2', random_state=10, solver='lbfgs', multi_class='multinomial'))
finalModel_LR = Pipeline([tfidfStep, lrStep])
finalModel_LR.fit(train_X, train_Y)
# Persist the fitted pipeline to disk; dump returns the list of written files
dump(finalModel_LR, '../Model/finalModel.joblib')

['../Model/finalModel.joblib']