In [69]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns

# Clustering packages
import sklearn.cluster as cluster
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AffinityPropagation

# Natural Language processing
import re
import nltk
from nltk.corpus import product_reviews_1, product_reviews_2, stopwords
from nltk.corpus import twitter_samples, gazetteers
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Machine Learning packages
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [64]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [22]:
print(product_reviews_1.fileids(), '\n\n', product_reviews_2.fileids())

['Apex_AD2600_Progressive_scan_DVD player.txt', 'Canon_G3.txt', 'Creative_Labs_Nomad_Jukebox_Zen_Xtra_40GB.txt', 'Nikon_coolpix_4300.txt', 'Nokia_6610.txt', 'README.txt'] 

 ['Canon_PowerShot_SD500.txt', 'Canon_S100.txt', 'Diaper_Champ.txt', 'Hitachi_router.txt', 'Linksys_Router.txt', 'MicroMP3.txt', 'Nokia_6600.txt', 'README.txt', 'ipod.txt', 'norton.txt']


In [62]:
# Create list with all product review files
product_reviews = product_reviews_1.fileids() + product_reviews_2.fileids()
# Remove readme from list
product_reviews = [product for product in product_reviews if product != 'README.txt']

# View name of all products
product_reviews

['Apex_AD2600_Progressive_scan_DVD player.txt',
 'Canon_G3.txt',
 'Creative_Labs_Nomad_Jukebox_Zen_Xtra_40GB.txt',
 'Nikon_coolpix_4300.txt',
 'Nokia_6610.txt',
 'Canon_PowerShot_SD500.txt',
 'Canon_S100.txt',
 'Diaper_Champ.txt',
 'Hitachi_router.txt',
 'Linksys_Router.txt',
 'MicroMP3.txt',
 'Nokia_6600.txt',
 'ipod.txt',
 'norton.txt']

In [37]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [61]:
reviews_1 =""
for product in product_reviews_1.fileids():
    summary = product_reviews_1.raw(product)
    reviews_1 = reviews_1 + summary

reviews_2 =""
for product in product_reviews_2.fileids():
    summary = product_reviews_2.raw(product)
    reviews_2 = reviews_2 + summary
   
reviews_clean = text_cleaner(reviews_1 + reviews_2  )
review_doc = nlp(reviews_clean)

In [63]:
sentences = []
for sentence in review_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    sentences.append(sentence)
    
print(sentences[20])
print('We have {} sentences and {} tokens.'.format(len(sentences), len(reviews_clean)))

['picture', 'scroll', 'b', 'w']
We have 11657 sentences and 802087 tokens.


In [66]:
sentences.head()

AttributeError: 'list' object has no attribute 'head'

In [40]:
# Utility function to create a list of the 200 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(200)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

In [41]:
for product in product_reviews_1.fileids():
    summary = product_reviews_1.raw(product)
    reviews_1 = reviews_1 + summary

apexwords = bag_of_words(apex_doc)

In [42]:
word_counts = bow_features(sentences, apexwords)
word_counts.head()

Processing row 0
Processing row 500


Unnamed: 0,dvd,player,play,apex,buy,work,n't,problem,good,try,...,design,receive,s,message,chapter,rw,plate,happy,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,(*),apex
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(*, *, *, *, *, *, *, *, *, *, *, *, *, *, *, ...",apex
2,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(*, Department, of, Computer, Sicence, *, Univ...",apex
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(*, Review, Source, :, amazon.com)",apex
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,(*),apex


In [67]:
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)

In [70]:
bow_rfc = ensemble.RandomForestClassifier()
train = bow_rfc.fit(X_train, y_train)

print('Training set score:', bow_rfc.score(X_train, y_train))
print('\nTest set score:', bow_rfc.score(X_test, y_test))

Training set score: 1.0

Test set score: 1.0
