In [46]:
import pandas as pd
import numpy as np
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.feature_selection import SelectFromModel

In [2]:
data = []
with open('../hw-1-parser/output/rambler-news.json') as file:
    for line in file:
        data.append(json.loads(line))

data = pd.DataFrame(data)

In [3]:
vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(data['text'])
y = data['category']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

In [5]:
model = LogisticRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(classification_report(y_test, predictions, target_names=np.unique(y)))

             precision    recall  f1-score   support

  economics       0.71      0.68      0.70       119
  incidents       0.89      0.92      0.90       120
   politics       0.81      0.87      0.84       126
     realty       0.78      0.73      0.75       128

avg / total       0.80      0.80      0.80       493



In [20]:
model = SVC(kernel='linear')
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(classification_report(y_test, predictions, target_names=np.unique(y)))

             precision    recall  f1-score   support

  economics       0.69      0.71      0.70       119
  incidents       0.93      0.93      0.93       120
   politics       0.83      0.86      0.84       126
     realty       0.78      0.73      0.75       128

avg / total       0.81      0.81      0.81       493



In [7]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(classification_report(y_test, predictions, target_names=np.unique(y)))

             precision    recall  f1-score   support

  economics       0.53      0.61      0.57       119
  incidents       0.84      0.88      0.86       120
   politics       0.78      0.77      0.78       126
     realty       0.70      0.58      0.63       128

avg / total       0.71      0.71      0.71       493



In [8]:
model = GradientBoostingClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(classification_report(y_test, predictions, target_names=np.unique(y)))

             precision    recall  f1-score   support

  economics       0.55      0.67      0.61       119
  incidents       0.92      0.82      0.87       120
   politics       0.82      0.81      0.82       126
     realty       0.76      0.69      0.72       128

avg / total       0.76      0.75      0.75       493



In [72]:
voting = VotingClassifier(
    estimators = [('lr', LogisticRegression()), 
                  ('rf', RandomForestClassifier()),
                  ('gb',GradientBoostingClassifier())],
    voting='hard', n_jobs=-1)

voting.fit(X_train, y_train)
predictions = voting.predict(X_test)
print(classification_report(y_test, predictions, target_names=np.unique(y)))

             precision    recall  f1-score   support

  economics       0.64      0.67      0.66       119
  incidents       0.88      0.91      0.89       120
   politics       0.83      0.83      0.83       126
     realty       0.76      0.70      0.73       128

avg / total       0.78      0.78      0.78       493



  if diff:


In [73]:
voting = VotingClassifier(
    estimators = [('lr', LogisticRegression()), 
                  ('rf', RandomForestClassifier()),
                  ('gb',GradientBoostingClassifier())],
    voting='soft', n_jobs=-1)

voting.fit(X_train, y_train)
predictions = voting.predict(X_test)
print(classification_report(y_test, predictions, target_names=np.unique(y)))

             precision    recall  f1-score   support

  economics       0.63      0.64      0.64       119
  incidents       0.90      0.90      0.90       120
   politics       0.80      0.83      0.81       126
     realty       0.77      0.73      0.75       128

avg / total       0.77      0.77      0.77       493



  if diff:


In [77]:
rf = RandomForestClassifier(n_estimators=100).fit(X_train,y_train)
model = SelectFromModel(rf, prefit=True)
X_train_new = model.transform(X_train)
X_test_new = model.transform(X_test)

In [78]:
model = LogisticRegression()
model.fit(X_train_new, y_train)
predictions = model.predict(X_test_new)
print(classification_report(y_test, predictions, target_names=np.unique(y)))

model = SVC(kernel='linear')
model.fit(X_train_new, y_train)
predictions = model.predict(X_test_new)
print(classification_report(y_test, predictions, target_names=np.unique(y)))

model = GradientBoostingClassifier()
model.fit(X_train_new, y_train)
predictions = model.predict(X_test_new)
print(classification_report(y_test, predictions, target_names=np.unique(y)))

             precision    recall  f1-score   support

  economics       0.74      0.68      0.71       119
  incidents       0.87      0.91      0.89       120
   politics       0.78      0.87      0.82       126
     realty       0.79      0.72      0.75       128

avg / total       0.79      0.80      0.79       493

             precision    recall  f1-score   support

  economics       0.70      0.71      0.71       119
  incidents       0.91      0.92      0.91       120
   politics       0.81      0.87      0.84       126
     realty       0.81      0.73      0.77       128

avg / total       0.81      0.81      0.81       493

             precision    recall  f1-score   support

  economics       0.53      0.66      0.58       119
  incidents       0.92      0.82      0.86       120
   politics       0.80      0.79      0.80       126
     realty       0.74      0.66      0.70       128

avg / total       0.75      0.73      0.74       493

