In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn.metrics import classification_report,confusion_matrix
from joblib import dump, load

In [2]:
df = pd.read_csv('reddit-india-data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1118 entries, 0 to 1117
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   flair      1118 non-null   object
 1   title      1118 non-null   object
 2   score      1118 non-null   int64 
 3   id         1118 non-null   object
 4   url        1118 non-null   object
 5   comms_num  1118 non-null   int64 
 6   body       677 non-null    object
 7   author     1118 non-null   object
 8   comments   1019 non-null   object
 9   timestamp  1118 non-null   object
dtypes: int64(2), object(8)
memory usage: 87.5+ KB


In [3]:
df['comments']=df['comments'].fillna("")
df['body']=df['body'].fillna("")
df['combined'] = df['title'] + df['body'] + df['comments']

In [None]:
I use only title as features and combined features. <br>
I use logistic regression, linear svc and randon forest as classifiers. <br>


In [4]:
X_train, X_test, y_train, y_test = train_test_split(df['title'],df['flair'],test_size = 0.2 ,)

pipeline = Pipeline([
        ('bow', CountVectorizer(  ngram_range=(1,2), stop_words= 'english')),  
        ('tfidf', TfidfTransformer()),  
        ('c',LogisticRegression())
    ])

clf1 = pipeline.fit(X_train,y_train)

pred=pipeline.predict(X_test)

print("Classification results using Logistic Regression\n")
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))

Classification results using Logistic Regression

[[19  0  1  0  0  0  0  1  1  0  0  0]
 [ 0  7  3  1  1  0  2  1  2  0  0  0]
 [ 0  4 10  0  0  0  5  0  3  1  1  0]
 [ 0  0  0 13  0  1  0  1  0  0  0  0]
 [ 1  0  0  0 21  0  0  1  0  0  0  0]
 [ 0  0  0  0  0 22  0  0  0  0  0  0]
 [ 0  3  5  0  0  0 11  2  2  1  0  0]
 [ 0  0  0  0  1  1  2  8  3  0  1  0]
 [ 1  3  2  0  0  0  1  1  9  1  0  0]
 [ 0  2  3  1  0  0  0  1  1 10  1  0]
 [ 0  2  0  0  0  0  1  0  2  0 15  0]
 [ 0  0  0  0  0  0  0  2  2  0  0  0]]
                    precision    recall  f1-score   support

               AMA       0.90      0.86      0.88        22
          AskIndia       0.33      0.41      0.37        17
  Business/Finance       0.42      0.42      0.42        24
              Food       0.87      0.87      0.87        15
     Non-Political       0.91      0.91      0.91        23
       Photography       0.92      1.00      0.96        22
    Policy/Economy       0.50      0.46      0.48        24


  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
pipeline = Pipeline([
        ('bow', CountVectorizer(  ngram_range=(1,2), stop_words= 'english')),  
        ('tfidf', TfidfTransformer()),  
        ('c',LinearSVC())
    ])

clf2 = pipeline.fit(X_train,y_train)
pred=pipeline.predict(X_test)

print("Classification results using LinearSVC\n")
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))

Classification results using LinearSVC

[[20  1  1  0  0  1  2  0  0  0  0  0]
 [ 1 10  0  3  1  0  0  0  0  1  1  0]
 [ 0  0 14  0  0  0  5  1  0  2  0  0]
 [ 1  1  0 18  0  1  0  0  1  0  0  0]
 [ 1  2  1  1 10  0  0  3  0  2  0  0]
 [ 0  0  0  0  2 16  0  0  0  0  0  0]
 [ 0  0  2  0  0  0 13  1  0  0  0  0]
 [ 1  0  0  1  0  1  2 12  2  1  0  0]
 [ 1  1  0  1  0  0  0  1 13  0  0  0]
 [ 0  1  0  1  0  0  1  1  0 18  0  0]
 [ 0  0  0  1  0  0  2  1  0  0 18  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  2]]
                    precision    recall  f1-score   support

               AMA       0.77      0.80      0.78        25
          AskIndia       0.62      0.59      0.61        17
  Business/Finance       0.78      0.64      0.70        22
              Food       0.69      0.82      0.75        22
     Non-Political       0.77      0.50      0.61        20
       Photography       0.84      0.89      0.86        18
    Policy/Economy       0.52      0.81      0.63        16
          

In [9]:
pipeline = Pipeline([
        ('bow', CountVectorizer(  ngram_range=(1,2), stop_words= 'english')),  
        ('tfidf', TfidfTransformer()),  
        ('c',RandomForestClassifier())
    ])

clf3 = pipeline.fit(X_train,y_train)
pred=pipeline.predict(X_test)

print("Classification results using Random Forest")
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))


Classification results using Random Forest
[[18  1  0  1  1  0  1  2  1  0  0  0]
 [ 0  9  0  3  1  2  0  0  0  1  1  0]
 [ 0  1 13  0  0  1  6  0  0  1  0  0]
 [ 1  1  0 18  0  1  0  0  1  0  0  0]
 [ 0  1  1  1 14  0  0  2  0  1  0  0]
 [ 0  0  0  0  0 18  0  0  0  0  0  0]
 [ 0  0  2  1  0  0 11  1  1  0  0  0]
 [ 2  0  0  0  2  0  2 11  3  0  0  0]
 [ 0  1  0  0  0  0  0  1 15  0  0  0]
 [ 0  0  2  0  0  0  0  0  0 19  1  0]
 [ 0  2  0  0  1  1  1  1  1  0 15  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  2]]
                    precision    recall  f1-score   support

               AMA       0.82      0.72      0.77        25
          AskIndia       0.56      0.53      0.55        17
  Business/Finance       0.72      0.59      0.65        22
              Food       0.75      0.82      0.78        22
     Non-Political       0.74      0.70      0.72        20
       Photography       0.78      1.00      0.88        18
    Policy/Economy       0.52      0.69      0.59        16
       

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df['combined'],df['flair'],test_size = 0.2 ,)

pipeline = Pipeline([
        ('bow', CountVectorizer(  ngram_range=(1,2), stop_words= 'english')),  
        ('tfidf', TfidfTransformer()),  
        ('c',LogisticRegression())
    ])

clf4 = pipeline.fit(X_train,y_train)

pred=pipeline.predict(X_test)

print("Classification results using Logistic Regression\n")
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))

Classification results using Logistic Regression

[[17  0  1  0  0  0  2  0  0  0  0  0]
 [ 2  8  4  1  0  0  0  1  1  2  0  0]
 [ 0  0 12  0  0  0  4  0  1  1  0  0]
 [ 0  0  0 19  0  0  0  1  0  0  0  0]
 [ 3  0  2  2  5  0  0  1  1  1  1  0]
 [ 0  0  1  0  0 25  0  0  0  1  0  0]
 [ 2  1  1  0  0  0 17  1  0  0  0  0]
 [ 2  0  0  4  1  0  0 16  2  0  0  0]
 [ 5  0  1  0  0  0  0  0 12  0  0  0]
 [ 2  1  1  0  0  0  0  0  0 16  0  0]
 [ 2  1  0  0  0  0  0  0  0  0 13  0]
 [ 2  0  0  0  0  0  0  0  0  0  0  1]]
                    precision    recall  f1-score   support

               AMA       0.46      0.85      0.60        20
          AskIndia       0.73      0.42      0.53        19
  Business/Finance       0.52      0.67      0.59        18
              Food       0.73      0.95      0.83        20
     Non-Political       0.83      0.31      0.45        16
       Photography       1.00      0.93      0.96        27
    Policy/Economy       0.74      0.77      0.76        22


In [11]:
pipeline = Pipeline([
        ('bow', CountVectorizer(  ngram_range=(1,2), stop_words= 'english')),  
        ('tfidf', TfidfTransformer()),  
        ('c',LinearSVC())
    ])

clf5 = pipeline.fit(X_train,y_train)
pred=pipeline.predict(X_test)

print("Classification results using LinearSVC\n")
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))

Classification results using LinearSVC

[[17  0  1  0  0  0  2  0  0  0  0  0]
 [ 1  7  3  1  0  0  1  1  2  3  0  0]
 [ 0  0 12  1  0  0  3  0  1  1  0  0]
 [ 0  0  0 19  0  0  0  1  0  0  0  0]
 [ 2  1  2  2  5  0  0  1  1  1  1  0]
 [ 0  0  1  0  0 25  0  0  0  1  0  0]
 [ 0  2  1  0  0  0 17  2  0  0  0  0]
 [ 1  0  0  5  0  0  0 18  1  0  0  0]
 [ 3  0  1  0  0  0  0  0 14  0  0  0]
 [ 2  0  1  0  0  0  0  1  0 16  0  0]
 [ 2  0  0  0  0  0  0  0  0  0 14  0]
 [ 2  0  0  0  0  0  0  0  0  0  0  1]]
                    precision    recall  f1-score   support

               AMA       0.57      0.85      0.68        20
          AskIndia       0.70      0.37      0.48        19
  Business/Finance       0.55      0.67      0.60        18
              Food       0.68      0.95      0.79        20
     Non-Political       1.00      0.31      0.48        16
       Photography       1.00      0.93      0.96        27
    Policy/Economy       0.74      0.77      0.76        22
          

In [12]:
pipeline = Pipeline([
        ('bow', CountVectorizer(  ngram_range=(1,2), stop_words= 'english')),  
        ('tfidf', TfidfTransformer()),  
        ('c',RandomForestClassifier())
    ])

clf6 = pipeline.fit(X_train,y_train)
pred=pipeline.predict(X_test)

print("Classification results using Random Forest")
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))


Classification results using Random Forest
[[14  0  0  2  1  0  1  1  0  0  1  0]
 [ 2 13  0  1  1  0  0  1  0  0  1  0]
 [ 2  1  8  0  0  0  6  0  0  1  0  0]
 [ 1  0  0 18  0  0  0  1  0  0  0  0]
 [ 1  0  1  1 11  0  0  1  0  1  0  0]
 [ 2  1  0  0  0 24  0  0  0  0  0  0]
 [ 1  1  2  0  0  0 17  1  0  0  0  0]
 [ 0  0  0  4  2  0  0 18  1  0  0  0]
 [ 2  0  0  1  0  0  1  0 13  0  1  0]
 [ 2  0  0  0  0  0  1  0  0 17  0  0]
 [ 1  2  0  0  0  0  0  0  0  0 13  0]
 [ 1  0  1  0  0  0  0  0  0  0  0  1]]
                    precision    recall  f1-score   support

               AMA       0.48      0.70      0.57        20
          AskIndia       0.72      0.68      0.70        19
  Business/Finance       0.67      0.44      0.53        18
              Food       0.67      0.90      0.77        20
     Non-Political       0.73      0.69      0.71        16
       Photography       1.00      0.89      0.94        27
    Policy/Economy       0.65      0.77      0.71        22
       

In [13]:
dump(clf6, 'flair_predict.joblib') 

['flair_predict.joblib']