In [1]:
#importing the libraries
import bz2
import pandas as pd
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from langdetect import detect

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mcand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Reading the files

In [2]:
#reading the bz2 file and transforming it to a dataframe

train_file = bz2.BZ2File("train.ft.txt.bz2")    
line_list = train_file.readlines()    
lines = [x.decode('utf-8') for x in line_list]    

# Split in two: sentiment and review    
sentiment = [review.split("__label__")[1][0] for review in lines]
reviews = [review.split("__label__")[1][1:]  for review in lines]
   
newlist = []
    
for i in range(len(sentiment)):
    
    newlist.append([sentiment[i], reviews[i]])

df = pd.DataFrame(newlist, columns = ['score', 'review'] )


### Removing punctuation
The preprocessed reviews are further cleaned by dropping punctuations. Using regular expressions, only whitespaces and alphanumeric characters are kept.

In [3]:
#removing punctation
pattern = r"[^\w\s]"
df["review"] = df["review"].str.replace(pat=pattern, repl=" ", regex=True)

### Converting to lowercase
Every letter is also converted to lower case. This makes it so that "iPhone" will not be distinguishable from "iphone".

In [4]:
#converting to lowercase
df["review"] = df["review"].str.lower()

### Removing stop words


Stop words consist of the most commonly used words that include pronouns (e.g. us, she, their), articles (e.g. the), and prepositions (e.g. under, from, off). These words are not helpful in distinguishing a document from another and are therefore dropped.

Note that the stop_words were stripped of punctuations just as what we have done to our dataset.

In [5]:
stop_words = stopwords.words("english")

stop_words = [word.replace("\'", "") for word in stop_words]

remove_stop_words = lambda row: " ".join([token for token in row.split(" ") \
                                          if token not in stop_words])
    
df["review"] = df["review"].apply(remove_stop_words)

### Removing extra spaces

Again, we make use of regular expressions to ensure we never get more than a single whitespace to separate words in our sentences.

In [6]:
#removing extra spaces
pattern = r"[\s]+"
df["review"] = df["review"].str.replace(pat=pattern, repl=" ", regex=True)

### Number of tokens in each review

adding extra features to the dataset

In [7]:
#number of tokens
df['n_tokens'] = df['review'].str.lower().str.split().apply(len)

### Sampling + detecting the language of each review

the dataset is actually huge and taking a lot of computational time to execute the language detection call. I decided to take a random sample of the dataset with only around 50000 records just for the sake of the assignment and time consumption.

In [8]:
#sampling
df2 = df.sample(n=50000)
#detect language
df2['language'] = df2['review'].apply(detect)

In [9]:
df2.head()

Unnamed: 0,score,review,n_tokens,language
149741,1,battery drains really fast charged thing day ...,14,en
531604,1,poor blu ray transfer fans film warned absolu...,63,en
1694190,2,energetic fun classical music expert stretch ...,56,en
910848,2,lovely tablecloth 70 x 86 oval tablecloth siz...,30,en
3446703,1,company unresponsive placed order oct 4th 201...,31,en


In [10]:
X = df2[['review','n_tokens','language']]
y = df2[['score']]

### Train test split

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Tokenization using CountVectorizer

The classical approach in expressing text as a set of features is getting the token frequency. Each entry to the dataframe is a document while each column corresponds to every unique token in the entire corpora. The row will identify how many times a word appears in the document.

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=1000, min_df=5, max_df=0.7)
train_vector = vectorizer.fit_transform(X_train["review"])
test_vector = vectorizer.transform(X_test["review"])

### Recreating the dataframes

In [13]:

train_df = pd.DataFrame(train_vector.A, columns=vectorizer.get_feature_names())
train_df['n_tokens'] = X_train['n_tokens'].values
train_df['language'] = X_train['language'].values
train_df['language_flag'] = train_df.language.apply(lambda language: 1 if 'en' in language else 0)

train_df = train_df.drop(['language'], axis=1)


test_df = pd.DataFrame(test_vector.A, columns=vectorizer.get_feature_names())
test_df['n_tokens'] = X_test['n_tokens'].values
test_df['language'] = X_test['language'].values
test_df['language_flag'] = test_df.language.apply(lambda language: 1 if 'en' in language else 0)

test_df = test_df.drop(['language'], axis=1)


### Classification using logistic regression

In [17]:
from sklearn.linear_model import LogisticRegression

clr = LogisticRegression()
clr.fit(train_df, y_train)

y_pred = clr.predict(test_df)


  y = column_or_1d(y, warn=True)


### Model reporting and accuracy

In [21]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

#confusion matrix
print(confusion_matrix(y_test,y_pred))

[[4241  817]
 [ 660 4282]]


In [22]:
#classification report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.87      0.84      0.85      5058
           2       0.84      0.87      0.85      4942

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [23]:
#accuracy score
print(accuracy_score(y_test, y_pred))

0.8523
