In [1]:
import pandas as pd    
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the labelled training reviews and the unlabelled test reviews (zipped, tab-separated).
train_data = pd.read_csv("words_datasets/labeledTrainData.tsv.zip", sep="\t")
test_data = pd.read_csv("words_datasets/testData.tsv.zip", sep="\t")

In [3]:
# Preview the loaded training data (columns: id, sentiment, review).
train_data

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...
24996,5064_1,0,I don't believe they made this film. Completel...
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil..."
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...


In [4]:
# Preview the loaded test data (columns: id, review; there is no sentiment column).
test_data

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...
...,...,...
24995,2155_10,"Sony Pictures Classics, I'm looking at you! So..."
24996,59_10,I always felt that Ms. Merkerson had never got...
24997,2531_1,I was so disappointed in this movie. I am very...
24998,7772_8,"From the opening sequence, filled with black a..."


In [5]:
# Step 1. Flag each frame so that train and test rows can be told apart after merging
train_data["is_test"], test_data["is_test"] = 0, 1

In [6]:
# Step 2. Combine the train and test frames into a single dataset
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and yields the same frame.
# Test rows have no label, so their `sentiment` becomes NaN.
dataset = pd.concat([train_data, test_data], ignore_index=True)
dataset

Unnamed: 0,id,sentiment,review,is_test
0,5814_8,1.0,With all this stuff going down at the moment w...,0
1,2381_9,1.0,"\The Classic War of the Worlds\"" by Timothy Hi...",0
2,7759_3,0.0,The film starts with a manager (Nicholas Bell)...,0
3,3630_4,0.0,It must be assumed that those who praised this...,0
4,9495_8,1.0,Superbly trashy and wondrously unpretentious 8...,0
...,...,...,...,...
49995,2155_10,,"Sony Pictures Classics, I'm looking at you! So...",1
49996,59_10,,I always felt that Ms. Merkerson had never got...,1
49997,2531_1,,I was so disappointed in this movie. I am very...,1
49998,7772_8,,"From the opening sequence, filled with black a...",1


In [7]:
# Step 3. Count the missing values in each column
dataset.isna().sum()

id               0
sentiment    25000
review           0
is_test          0
dtype: int64

In [8]:
# Stop-word sets keyed by language, filled lazily so the NLTK corpus is read only once.
_STOPWORDS_BY_LANGUAGE = {}


def _get_stopwords(language="english"):
    """Return the NLTK stop words for ``language`` as a frozenset, loading them once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def clean_string(text):
    """Turn a raw review into a space-separated string of lowercase content words.

    Steps: strip HTML markup, replace every character that is not an ASCII
    letter with a space, lowercase, split on whitespace and drop English
    stop words.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    str
        The remaining words joined by single spaces (empty string if none remain).
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever parser
    # happens to be installed and emits GuessedAtParserWarning.
    text = BeautifulSoup(text, "html.parser").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", text)  # keep ASCII letters only
    words = letters_only.lower().split()
    # Cached set: the original rebuilt it from the corpus on every call.
    stops = _get_stopwords("english")
    return " ".join(w for w in words if w not in stops)

In [9]:
# Step 4. Clean every review with clean_string and store the result in a new column
dataset["review_clean"] = dataset["review"].map(clean_string)

In [10]:
# Step 5. Split the combined dataset back into its train and test parts
train_data = (
    dataset.query("is_test == 0")
    .drop(columns=["is_test"])
    # Combining with the unlabelled test rows filled `sentiment` with NaN and
    # upcast it to float; restore integer class labels so the model (and the
    # submission file) use 0/1 rather than 0.0/1.0.
    .astype({"sentiment": "int64"})
)
# The test part has no labels, so the all-NaN `sentiment` column is dropped as well.
test_data = dataset.query("is_test == 1").drop(columns=["is_test", "sentiment"])

In [11]:
# Preview the test part, which now carries the cleaned `review_clean` column.
test_data

Unnamed: 0,id,review,review_clean
25000,12311_10,Naturally in a film who's main themes are of m...,naturally film main themes mortality nostalgia...
25001,8348_2,This movie is a disaster within a disaster fil...,movie disaster within disaster film full great...
25002,5828_4,"All in all, this is a movie for kids. We saw i...",movie kids saw tonight child loved one point k...
25003,7186_2,Afraid of the Dark left me with the impression...,afraid dark left impression several different ...
25004,12128_7,A very accurate depiction of small time mob li...,accurate depiction small time mob life filmed ...
...,...,...,...
49995,2155_10,"Sony Pictures Classics, I'm looking at you! So...",sony pictures classics looking sony got rights...
49996,59_10,I always felt that Ms. Merkerson had never got...,always felt ms merkerson never gotten role fit...
49997,2531_1,I was so disappointed in this movie. I am very...,disappointed movie familiar case read mark fuh...
49998,7772_8,"From the opening sequence, filled with black a...",opening sequence filled black white shots remi...


In [12]:
# Step 6. Create the bag-of-words vectorizer (word-level tokens, at most 9000 features)
vectorizer = CountVectorizer(
    analyzer="word",
    max_features=9000,
)

In [13]:
# Step 6.1 Vectorize the training data
# The vectorizer is fitted here, on the training reviews only, and the sparse
# result is converted to a dense array.
x = vectorizer.fit_transform(train_data["review_clean"]).toarray()
y = train_data["sentiment"]

In [14]:
# Step 6.2 Vectorize the test data, reusing the vectorizer without refitting it
x_test = vectorizer.transform(test_data["review_clean"]).toarray()

In [15]:
# Step 7. Train the random forest
# A fixed random_state makes the fitted forest, and therefore the submission,
# reproducible across kernel restarts.
random_forest = RandomForestClassifier(n_estimators=90, random_state=42)
random_forest.fit(x, y)

RandomForestClassifier(n_estimators=90)

In [16]:
# Step 8. Predict sentiment labels for the vectorized test reviews
predict = random_forest.predict(x_test)

In [17]:
# Pair each test review id with its predicted sentiment and save the result
# as a CSV file without the index column.
predict_fo_csv = test_data[["id"]].assign(sentiment=predict)
predict_fo_csv.to_csv("words_done.csv", index=False)