<a href="https://colab.research.google.com/github/mohanrajmit/ML-training/blob/master/count_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Vectorizing Raw Data: Count Vectorization

### Count vectorization 

Creates a document-term matrix where the entry of each cell will be a count of the number of times that word occurred in that document.

In [2]:
!git clone  https://github.com/mohanrajmit/Sentiment-Analsysis.git

fatal: destination path 'Sentiment-Analsysis' already exists and is not an empty directory.


### Read in text

In [4]:
import pandas as pd
import re
import string
import nltk
nltk.download('stopwords')
pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("/content/Sentiment-Analsysis/train.csv")
#data.columns = ['label', 'body_text']

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #iger...
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias…...
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connec...
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr....
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19....


### Create function to remove punctuation, tokenize, remove stopwords, and stem

In [6]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

### Apply CountVectorizer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(data['tweet'])
print(X_counts.shape)
print(count_vect.get_feature_names())

(7920, 20338)
['', '0', '000', '00000', '002', '004', '0051', '007', '008', '01', '010111', '0101am', '0101ammay', '010315', '011', '01204', '015', '01634', '01924', '02', '0200am', '0230am', '0230pm', '02aug2017', '03', '0300am', '0301am', '0301amjun', '0301pm', '0301pmjune', '0330am', '0330amjun', '0340pm', '035mm', '04', '0400am', '0400pm', '0430pm', '0490love', '05', '0530am', '06', '0600am', '0630am', '069', '07', '070', '0700', '0700am', '0700pm', '0701', '0701am', '0701amapril', '0730pm', '0777', '07yilmaz242', '08', '0800am', '0800pm', '0801ammay', '0801pm', '0801pmapril', '0801pmjune', '08184820028', '0830am', '0830pm', '08t', '09', '0900am', '0900pm', '0930am', '099', '0g', '0ä1ö', '1', '10', '100', '1000', '10000', '1000am', '100400', '100happyday', '100odd', '101', '10104', '1011', '1015', '102', '1020', '1021', '102409', '1030am', '1030pm', '1031', '104', '104x50x60', '105', '1057', '106', '1068', '106finalact', '107', '1080p', '10gb', '10it', '10like', '10mpx', '10pc', '1

### Apply CountVectorizer to smaller sample

In [8]:
data_sample = data[0:20]

count_vect_sample = CountVectorizer(analyzer=clean_text)
X_counts_sample = count_vect_sample.fit_transform(data_sample['tweet'])
print(X_counts_sample.shape)
print(count_vect_sample.get_feature_names())

(20, 197)
['1995', '2018', '2inch', '3d', '5c', 'agre', 'amaz', 'amazon', 'android', 'anoth', 'app', 'appl', 'ball', 'bandwagon', 'batteri', 'bay', 'beauti', 'big', 'bigd', 'blackberrypictwittercomzpggd7cazn', 'bout', 'boy', 'c', 'cabl', 'camera', 'case', 'cellcom', 'charg', 'charger', 'color', 'connect', 'contempl', 'crash', 'crazi', 'cross', 'cute', 'dalla', 'daventri', 'dead', 'deepellum', 'depress', 'design', 'dont', 'downtown', 'drinkyourhaterad', 'drop', 'etsi', 'evemun', 'even', 'everi', 'fact', 'final', 'fingerprint', 'five', 'followforfollow', 'fuck', 'fuckin', 'fuckingpiss', 'fun', 'georg', 'give', 'go', 'gunna', 'ha', 'happi', 'hard', 'hateorang', 'health', 'heavi', 'hey', 'home', 'httpfbme6n3lsupcu', 'httpinstagramcompd0r5sqry5b', 'httpinstagramcomppzjooaqqz4', 'httpinstagramcompyget5jc6jm', 'httpinstagrampli5ujs4k', 'httpinstagrampnuyfhvye7i', 'httpinstagrampvj6bg5tlql', 'httpsbestcheapphonescomindexphpproduct2016newsweet3dmineralwaterbottleloveicecreamphonecasecouplessoft

### Vectorizers output sparse matrices

_**Sparse Matrix**: A matrix in which most entries are 0. In the interest of efficient storage, a sparse matrix will be stored by only storing the locations of the non-zero elements._

In [9]:
X_counts_sample

<20x197 sparse matrix of type '<class 'numpy.int64'>'
	with 232 stored elements in Compressed Sparse Row format>

In [10]:
X_counts_df = pd.DataFrame(X_counts_sample.toarray())
X_counts_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196
0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0
7,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [11]:
X_counts_df.columns = count_vect_sample.get_feature_names()
X_counts_df

Unnamed: 0,1995,2018,2inch,3d,5c,agre,amaz,amazon,android,anoth,app,appl,ball,bandwagon,batteri,bay,beauti,big,bigd,blackberrypictwittercomzpggd7cazn,bout,boy,c,cabl,camera,case,cellcom,charg,charger,color,connect,contempl,crash,crazi,cross,cute,dalla,daventri,dead,deepellum,...,suck,summer,sun,support,talk,taylor,technolog,test,thank,theori,thinner,time,tobi,toward,transpar,truth,truthbetold,tweegram,type,uk,uncl,unless,unplug,updat,ur,urban,us,want,water,way,wifi,wire,wont,work,would,xperia,xperiaz,yay,year,young
0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0
7,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
