In [1]:
import gzip

# Decompress the gzipped dataset to a plain CSV next to it.
# The copy is done on raw bytes: decoding to str and re-writing through a
# text-mode file opened without an explicit encoding would re-encode with the
# platform's locale codec (not necessarily UTF-8) and translate newlines,
# which can fail or alter the data on some systems.
with gzip.open('movie_data.csv.gz', 'rb') as gz, open('movie_data.csv', 'wb') as f:
    # Stream in 1 MiB chunks so the whole file never has to sit in memory.
    while True:
        chunk = gz.read(1024 * 1024)
        if not chunk:
            break
        f.write(chunk)

In [2]:
import pandas as pd

# Load the decompressed reviews; the first line of the file is consumed as the
# header by read_csv's default settings.
df = pd.read_csv('movie_data.csv')
# Replace whatever header names the file carried with fixed ones. Direct
# assignment to .columns makes pandas raise if the frame does not have
# exactly two columns.
df.columns = ['review', 'sentiment']

In [3]:
import numpy as np

# Shuffle the rows with a fixed seed so the order is reproducible, then
# overwrite the CSV on disk with the shuffled frame (row index not written).
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
df.to_csv('movie_data.csv', encoding='utf-8', index=False)

In [4]:
# Read the CSV back from disk (explicitly as UTF-8); since the file was saved
# without its index, the rows get a fresh default index here.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

# Preview the first three rows.
df.head(n=3)

Unnamed: 0,review,sentiment
0,"Election is a Chinese mob movie, or triads in ...",1
1,I was just watching a Forensic Files marathon ...,0
2,Police Story is a stunning series of set piece...,1


In [5]:
# Display the frame's dimensions as a (rows, columns) tuple.
df.shape

(50000, 2)

In [6]:
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

# Three toy documents; the last one repeats several words so that counts
# greater than one show up in the resulting matrix.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, '
    'and one and one is two',
])

# Learn the vocabulary and build the document-term count matrix in one step.
count = CountVectorizer()
bag = count.fit_transform(docs)

# Print the token -> column-index mapping, then the dense count matrix
# (one row per document).
print(count.vocabulary_)
print(bag.toarray())

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [7]:
from sklearn.feature_extraction.text import TfidfTransformer

# Print floats with two decimals; this NumPy print option stays in effect for
# the rest of the session, not just this cell.
np.set_printoptions(precision=2)

# Re-fit the count vectorizer on the toy documents and convert the raw counts
# to L2-normalised tf-idf weights with smoothed idf.
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
print(
    tfidf.fit_transform(count.fit_transform(docs))
    .toarray()
)

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [8]:
# Show the last 50 characters of the review stored under index label 0.
df.loc[0, 'review'][-50:]

'nd three more acting performances (including Yam).'

In [9]:
from commons import preprocessor

# Run the preprocessor imported from `commons` on the last 50 characters of
# the review at index label 0 and display the cleaned string.
preprocessor(df.loc[0, 'review'][-50:])

'nd three more acting performances including yam '

In [14]:
# Try the preprocessor on a hand-written string containing an HTML closing
# tag, emoticon-like sequences and punctuation.
# NOTE(review): the standalone letter before "test" appears to be a non-ASCII
# (Cyrillic-looking) character rather than ASCII 'a' - confirm it is intended.
preprocessor("</a>This :) is : ( а test :-) !")

'this is а test : ('

In [15]:
# Apply the preprocessor to every review, overwriting the 'review' column.
# NOTE(review): the raw text is replaced in place, so re-running this cell
# feeds already-processed text through preprocessor again - confirm that is
# harmless, or reload the CSV before re-running.
df['review'] = df['review'].apply(preprocessor)