In [194]:
# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import pandas as pd       
train = pd.read_csv("test.csv")

In [195]:
train.shape

(5200, 4)

In [196]:
train.columns.values

array(['id', 'title', 'author', 'text'], dtype=object)

In [197]:
from bs4 import BeautifulSoup  

In [198]:
example1 = BeautifulSoup(train['text'][0])  

# Print the raw review and then the output of get_text(), for 
# comparison

print(example1.get_text() ) 

PALO ALTO, Calif.  —   After years of scorning the political process, Silicon Valley has leapt into the fray. The prospect of a President Donald J. Trump is pushing the tech community to move beyond its traditional role as donors and to embrace a new existence as agitators and activists. A distinguished venture capital firm emblazoned on its corporate home page an earthy   epithet. One prominent tech chieftain says the consequences of Mr. Trump’s election would “range between disastrous and terrible. ” Another compares him to a dictator. And nearly 150 tech leaders signed an open letter decrying Mr. Trump and his campaign of “anger” and “bigotry. ” Not quite all the action is  . Peter Thiel, a founder of PayPal and Palantir who was the first outside investor in Facebook, spoke at the Republican convention in July. The New York Times reported on Saturday that Mr. Thiel is giving $1. 25 million to support Mr. Trump’s candidacy even as other supporters flee. (He also recently gave $1 mill

In [199]:
import re
# Use regular expressions to do a find-and-replace
letters_only = re.sub("[^a-zA-Z]",           # The pattern to search for
                      " ",                   # The pattern to replace it with
                      example1.get_text() )  # The text to search


lower_case = letters_only.lower()        # Convert to lower case
words = lower_case.split()  
print(letters_only)

PALO ALTO  Calif       After years of scorning the political process  Silicon Valley has leapt into the fray  The prospect of a President Donald J  Trump is pushing the tech community to move beyond its traditional role as donors and to embrace a new existence as agitators and activists  A distinguished venture capital firm emblazoned on its corporate home page an earthy   epithet  One prominent tech chieftain says the consequences of Mr  Trump s election would  range between disastrous and terrible    Another compares him to a dictator  And nearly     tech leaders signed an open letter decrying Mr  Trump and his campaign of  anger  and  bigotry    Not quite all the action is    Peter Thiel  a founder of PayPal and Palantir who was the first outside investor in Facebook  spoke at the Republican convention in July  The New York Times reported on Saturday that Mr  Thiel is giving        million to support Mr  Trump s candidacy even as other supporters flee   He also recently gave    mill

In [200]:
import nltk


In [201]:
from nltk.corpus import stopwords # Import the stop word list


In [202]:
words = [w for w in words if not w in stopwords.words("english")]
print (words)

['palo', 'alto', 'calif', 'years', 'scorning', 'political', 'process', 'silicon', 'valley', 'leapt', 'fray', 'prospect', 'president', 'donald', 'j', 'trump', 'pushing', 'tech', 'community', 'move', 'beyond', 'traditional', 'role', 'donors', 'embrace', 'new', 'existence', 'agitators', 'activists', 'distinguished', 'venture', 'capital', 'firm', 'emblazoned', 'corporate', 'home', 'page', 'earthy', 'epithet', 'one', 'prominent', 'tech', 'chieftain', 'says', 'consequences', 'mr', 'trump', 'election', 'would', 'range', 'disastrous', 'terrible', 'another', 'compares', 'dictator', 'nearly', 'tech', 'leaders', 'signed', 'open', 'letter', 'decrying', 'mr', 'trump', 'campaign', 'anger', 'bigotry', 'quite', 'action', 'peter', 'thiel', 'founder', 'paypal', 'palantir', 'first', 'outside', 'investor', 'facebook', 'spoke', 'republican', 'convention', 'july', 'new', 'york', 'times', 'reported', 'saturday', 'mr', 'thiel', 'giving', 'million', 'support', 'mr', 'trump', 'candidacy', 'even', 'supporters', 

In [203]:
def review_to_words( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))     

In [204]:
clean_review = review_to_words( train["text"][0] )
print (clean_review)

palo alto calif years scorning political process silicon valley leapt fray prospect president donald j trump pushing tech community move beyond traditional role donors embrace new existence agitators activists distinguished venture capital firm emblazoned corporate home page earthy epithet one prominent tech chieftain says consequences mr trump election would range disastrous terrible another compares dictator nearly tech leaders signed open letter decrying mr trump campaign anger bigotry quite action peter thiel founder paypal palantir first outside investor facebook spoke republican convention july new york times reported saturday mr thiel giving million support mr trump candidacy even supporters flee also recently gave million super pac supports senator rob portman republican freshman running ohio getting involved politics used seen clashing silicon valley value system transform world making problems obsolete solving washington entrepreneurs want alienate whatever segment customers 

In [205]:
# Get the number of reviews based on the dataframe column size
num_reviews = train["text"].size


# Initialize an empty list to hold the clean reviews
clean_train_reviews = []

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list 
for i in range( 0, 589 ):
   
    # Call our function for each one, and add the result to the list of
    # clean reviews

    clean_train_reviews.append( review_to_words( train["text"][i] ) )
    
print(clean_train_reviews)
    
 



In [206]:
print ("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(max_features = 1000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_train_reviews)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()
print (train_data_features.shape)

Creating the bag of words...

(589, 1000)


In [207]:
vocab = vectorizer.get_feature_names()
print (vocab)

['ability', 'able', 'access', 'according', 'accused', 'across', 'act', 'action', 'actually', 'added', 'adding', 'addition', 'additional', 'address', 'administration', 'admiral', 'age', 'agency', 'agents', 'ago', 'agreed', 'agreement', 'air', 'airport', 'al', 'allies', 'allowed', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'america', 'american', 'americans', 'among', 'announced', 'another', 'answer', 'anti', 'anyone', 'anything', 'appeared', 'approach', 'april', 'area', 'areas', 'armed', 'arms', 'army', 'around', 'arrested', 'arrived', 'art', 'article', 'ask', 'asked', 'assad', 'association', 'athletes', 'attack', 'attacks', 'attention', 'attorney', 'australia', 'author', 'authorities', 'authority', 'available', 'away', 'back', 'bad', 'bank', 'banks', 'barack', 'based', 'beach', 'became', 'become', 'began', 'behind', 'believe', 'believed', 'best', 'better', 'beyond', 'big', 'biggest', 'bill', 'billion', 'black', 'blood', 'board', 'body', 'book', 'books', 'brain'

In [208]:
import numpy as np

# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print (count, tag)
    


49 ability
121 able
79 access
267 according
75 accused
128 across
100 act
76 action
119 actually
153 added
54 adding
59 addition
50 additional
52 address
184 administration
70 admiral
59 age
115 agency
75 agents
135 ago
57 agreed
57 agreement
155 air
66 airport
68 al
56 allies
57 allowed
119 almost
55 alone
103 along
138 already
735 also
72 although
135 always
239 america
435 american
146 americans
195 among
91 announced
305 another
60 answer
80 anti
86 anyone
88 anything
66 appeared
48 approach
51 april
88 area
62 areas
49 armed
49 arms
101 army
234 around
67 arrested
48 arrived
79 art
131 article
74 ask
171 asked
53 assad
50 association
48 athletes
151 attack
88 attacks
61 attention
72 attorney
48 australia
59 author
69 authorities
95 authority
57 available
142 away
323 back
65 bad
73 bank
49 banks
68 barack
99 based
54 beach
119 became
186 become
144 began
112 behind
148 believe
51 believed
143 best
130 better
52 beyond
146 big
52 biggest
127 bill
90 billion
165 black
53 blood
86 bo

50 whatever
167 whether
326 white
70 whole
116 whose
70 wife
90 win
49 wine
123 within
197 without
156 woman
273 women
98 words
285 work
69 worked
93 workers
128 working
71 works
410 world
1077 would
69 written
87 wrong
140 wrote
54 www
450 year
593 years
58 yes
151 yet
246 york
126 young


In [209]:
# Read the test data
test = pd.read_csv("train.csv")

# Verify that there are 25,000 rows and 2 columns
print (test.shape)

# Create an empty list and append the clean reviews one by one
num_reviews = len(test["text"])
clean_test_reviews = [] 

print ("Cleaning and parsing the test set movie reviews...\n")
for i in range(0,num_reviews):
    if( (i+1) % 1000 == 0 ):
        print ("Review %d of %d\n" % (i+1, num_reviews))
    clean_review = review_to_words( test["text"][i] )
    clean_test_reviews.append( clean_review )

# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

# Use pandas to write the comma-separated output file
output.to_csv( "Bag_of_Words_model.csv", index=False, quoting=3 )


(20800, 5)
Cleaning and parsing the test set movie reviews...



TypeError: object of type 'float' has no len()