# Text Classification Exercise

Jeopardy Questions download:
https://drive.google.com/file/d/0BwT5wj_P7BKXUl9tOUJWYzVvUjA/view?usp=sharing

In [15]:
import os
import warnings
import sys

import pandas as pd
import numpy as np

import re
from nltk import *
#nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

### Read in data

In [11]:
# Location of the Jeopardy CSV. Set the JEOPARDY_CSV environment variable to
# point at your own copy of the file; when it is unset, the original author's
# local path is used so existing behaviour is unchanged.
JEOPARDY_CSV_PATH = os.environ.get(
    "JEOPARDY_CSV",
    "C:\\Users\\MaryWillcock\\Documents\\GitHub\\NLP_Flask_Test\\JEOPARDY_CSV.csv",
)
df = pd.read_csv(JEOPARDY_CSV_PATH)
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


### Pull out features and labels from DF

The features are the Question column of the DataFrame (DF); this is the raw text that we will classify. The labels are the Category column of the DF.

We are going to do unsupervised learning through K-means clustering. However, we do have data that is labeled with a category, so we can compare the category label with the natural clustering.

In [12]:
# Positional column indices used to slice the DataFrame. Per the notebook's
# description these are the Question and Category columns respectively
# (TODO confirm against the CSV if its layout ever changes).
QUESTION_COL = 5
CATEGORY_COL = 3

# Question text is the input to vectorization; categories are kept as the
# reference labels to compare against the clusters.
features = df.iloc[:, QUESTION_COL].values
labels = df.iloc[:, CATEGORY_COL].values

### Text Cleaning

The function below first cleans up the text, then vectorizes the cleaned-up text with TF-IDF.

In [20]:
def _clean_text(text):
    """Normalize one raw document into lowercase, single-space-separated text."""
    # Replace every non-word character (punctuation, symbols) with a space.
    cleaned = re.sub(r'\W', ' ', str(text))

    # Drop single letters that stand alone between whitespace.
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)

    # Drop a single letter at the very start of the text. The caret must be
    # unescaped to act as the start-of-string anchor; escaped (r'\^') it would
    # look for a literal '^', which can no longer occur after the \W step.
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)

    # Collapse runs of whitespace into one space.
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # Strip a leading 'b' left over from the repr of a bytes literal.
    cleaned = re.sub(r'^b\s+', '', cleaned)

    return cleaned.lower()


def clean_n_vect(feat, max_features=2300, min_df=7, max_df=0.8):
    """Clean raw text documents and convert them to TF-IDF vectors.

    Parameters
    ----------
    feat : iterable
        Raw documents. Each item is passed through ``str()`` before cleaning,
        so non-string values are tolerated.
    max_features : int, default 2300
        Passed to ``TfidfVectorizer(max_features=...)``.
    min_df : int or float, default 7
        Passed to ``TfidfVectorizer(min_df=...)``.
    max_df : int or float, default 0.8
        Passed to ``TfidfVectorizer(max_df=...)``.

    Returns
    -------
    processed_feats
        The result of ``fit_transform(...).toarray()``: one row per document.
    vectorizer : TfidfVectorizer
        The fitted vectorizer, returned so callers can inspect the vocabulary
        or transform further text consistently.
    """
    # Iterate over the items directly rather than by integer position, so any
    # iterable works (including a pandas Series with a non-default index).
    cleaned_docs = [_clean_text(doc) for doc in feat]

    vectorizer = TfidfVectorizer(
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        stop_words=stopwords.words('english'),
    )
    # NOTE: .toarray() materialises the full matrix in memory; it is kept
    # because callers receive an array from this function.
    processed_feats = vectorizer.fit_transform(cleaned_docs).toarray()
    return processed_feats, vectorizer

In [21]:
# Clean and vectorize every question; keep the fitted vectorizer so the
# vocabulary can be looked up later.
processed_features, vectorizer = clean_n_vect(features)

# Hold out 20% of the rows, using a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    processed_features,
    labels,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Number of clusters to look for in the vectorized questions.
true_k = 10
# random_state pins the k-means++ initialisation. With n_init=1 there is only
# a single run, so without a seed the clusters would differ on every execution.
model = KMeans(
    n_clusters=true_k,
    init='k-means++',
    max_iter=100,
    n_init=1,
    random_state=0,
)
model.fit(processed_features)

In [None]:
# For each cluster centroid, feature indices sorted from highest to lowest weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

# Vocabulary lookup (feature index -> term). get_feature_names() was removed in
# scikit-learn 1.2; prefer get_feature_names_out() and fall back to the old
# name only when running on an older release.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

In [None]:
# Print the ten highest-weighted terms of each cluster centroid.
for i in range(true_k):
    # No trailing comma after print(): that was the Python 2 way to suppress
    # the newline; in Python 3 it only builds a throwaway tuple.
    print('Cluster %d:' % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])