In [108]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [109]:
df_texts = pd.Series(pd.read_fwf('train.texts', sep="\n", header=None)[0], name='review')
df_labels = pd.Series(pd.read_fwf('train.labels', sep="\n", header=None)[0], name='sentiment')
df = pd.concat([df_texts, df_labels], axis=1)
df_x_test = pd.read_csv('texts.csv')['texts'].to_frame()

df.replace("pos", 1, inplace=True)
df.replace("neg", 0, inplace=True)

In [112]:
def data_processing(text):
    text= text.lower()
    text = re.sub('<br />', '', text)
    text = re.sub(r"https\S+|www\S+|http\S+", '', text, flags = re.MULTILINE)
    text = re.sub(r'\@w+|\#', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text_tokens = word_tokenize(text)
    filtered_text = [w for w in text_tokens if not w in stop_words]
    return " ".join(filtered_text)

In [113]:
df.review = df['review'].apply(data_processing)
df_x_test.texts = df_x_test['texts'].apply(data_processing)

In [114]:
df = df.drop_duplicates('review')

In [115]:
stemmer = PorterStemmer()
def stemming(data):
    text = [stemmer.stem(word) for word in data]
    return data

In [116]:
df.review = df['review'].apply(lambda x: stemming(x))
df_x_test.texts = df_x_test['texts'].apply(lambda x: stemming(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.review = df['review'].apply(lambda x: stemming(x))


In [117]:
x_train = df['review']
y_train = df['sentiment']
x_test = df_x_test['texts']

vect = TfidfVectorizer(max_df=0.8, ngram_range=(1,3))
x_train = vect.fit_transform(x_train)
x_test = vect.transform(x_test)

print("Size of x_train: ", (x_train.shape))
print("Size of y_train: ", (y_train.shape))
print("Size of x_test: ", (x_test.shape))

In [120]:
from sklearn.linear_model import PassiveAggressiveClassifier

passive = PassiveAggressiveClassifier()
passive.fit(x_train, y_train)
pred = passive.predict(x_test)

In [121]:
y_pred = pd.DataFrame(data=range(0,10000))
y_pred.columns = ['id']
y_pred['labels'] = pred
y_pred.labels.replace(1, 'pos', inplace=True)
y_pred.labels.replace(0, 'neg', inplace=True)

In [123]:
y_pred.to_csv('y_pred.csv', index=False)