In [1]:
import pickle
import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as nm


In [3]:
###############################################################################
#  Load the raw text dataset.
###############################################################################

print("Loading dataset...")

# The raw text dataset is stored as tuple in the form:
# (X_train_raw, y_train_raw, X_test_raw, y_test)
# The 'filtered' dataset excludes any articles that we failed to retrieve
# fingerprints for.
'''categories = ['comp.graphics', 'sci.space','rec.sport.baseball','sci.electronics']
newsgroups_train = fetch_20newsgroups(subset='train',categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test',categories=categories)

X_train_raw = newsgroups_train.data
#y_train_labels = newsgroups_train.target
X_test_raw = newsgroups_test.data
#y_test_labels = newsgroups_test.target

print(len(X_train_raw))
y_train = newsgroups_train.target
y_test = newsgroups_test.target'''

categories = ['action_adventure','drama_romance','comedy_family','horror_thriller']
tmdbmovies_train = pd.read_csv("total_train.csv")
tmdbmovies_test = pd.read_csv("total_test.csv")
X_train_raw = tmdbmovies_train["data"]
y_train = tmdbmovies_train["Label"]
X_test_raw = tmdbmovies_test["data"]
y_test = tmdbmovies_test["Label"]
print(len(X_train_raw),len(y_train))
print(len(X_test_raw),len(y_test))

vectorizer = TfidfVectorizer()
lda = LatentDirichletAllocation()
# Build the tfidf vectorizer from the training data ("fit"), and apply it 
# ("transform").
vectorizer.fit(X_train_raw)
X_train_tfidf = vectorizer.transform(X_train_raw).toarray()
print("done")
#print("  Actual number of tfidf features: %d" % X_train_tfidf.get_shape()[0])

Loading dataset...
2400 2400
1600 1600
done


In [4]:
print("\nPerforming dimensionality reduction using LSA")
t0 = time.time()

# Project the tfidf vectors onto the first N principal components.
# Though this is significantly fewer features than the original tfidf vector,
# they are stronger features, and the accuracy is higher.
svd = TruncatedSVD(100)
lsa = make_pipeline(svd, Normalizer(copy=False))

# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

print("  done in %.3fsec" % (time.time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("  Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))


# Now apply the transformations to the test data as well.
X_test_tfidf = vectorizer.transform(X_test_raw).toarray()
X_test_lsa = lsa.transform(X_test_tfidf)


###############################################################################
#  Run classification of the test articles
###############################################################################

print("\nClassifying LSA vectors...")

# Time this step.
t0 = time.time()

# Build a k-NN classifier. Use k = 5 (majority wins), the cosine distance, 
# and brute-force calculation of distances.
knn_lsa = KNeighborsClassifier(n_neighbors=5,metric='cosine')
knn_lsa.fit(X_train_lsa, y_train)

# Classify the test vectors.
p = knn_lsa.predict(X_test_lsa)



# Measure accuracy
numRight = 0;
for i in range(0,len(p)):
    if p[i] == y_test[i]:
        numRight += 1

print("  (%d / %d) correct - %.2f%%" % (numRight, len(y_test), float(numRight) / float(len(y_test)) * 100.0))

# Calculate the elapsed time (in seconds)
elapsed = (time.time() - t0)    
print("    done in %.3fsec" % elapsed)

y_pred = p
print("========================TFIDF + LSA + Cosine======================")
print(classification_report(y_test, y_pred, target_names=categories))
cm = pd.DataFrame(confusion_matrix(y_test, y_pred),index=['comp.graphics:true', 'sci.space:true','rec.sport.baseball:true','sci.electronics:true'],columns=['comp.graphics:pred', 'sci.space:pred','rec.sport.baseball:pred','sci.electronics:pred'])
#cm = pd.DataFrame(pd.crosstab(y_test,y_pred),index=categories,columns=categories)
cw = cm.sum(axis=0)
row_df = pd.DataFrame([cw],index=["All"])
cm = pd.concat([ cm,row_df])
cm["All"] = cm.sum(axis=1)
print("confusion matrix:\n",cm)
print("accuracy:",metrics.accuracy_score(y_test, y_pred)*100)



Performing dimensionality reduction using LSA
  done in 2.433sec
  Explained variance of the SVD step: 16%

Classifying LSA vectors...
  (599 / 1600) correct - 37.44%
    done in 0.216sec
                  precision    recall  f1-score   support

action_adventure       0.39      0.59      0.47       453
   drama_romance       0.36      0.46      0.40       446
   comedy_family       0.38      0.23      0.29       409
 horror_thriller       0.36      0.11      0.17       292

        accuracy                           0.37      1600
       macro avg       0.37      0.35      0.33      1600
    weighted avg       0.37      0.37      0.35      1600

confusion matrix:
                          comp.graphics:pred  sci.space:pred  \
comp.graphics:true                      268             104   
sci.space:true                          162             204   
rec.sport.baseball:true                 144             155   
sci.electronics:true                    119             105   
All       