In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [2]:
# Read the CSV file containing the IMDB movie reviews dataset into a DataFrame.
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')


In [3]:
# Count the occurrences of each unique value in the 'sentiment' column of the DataFrame and display the result.
df['sentiment'].value_counts()


sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [4]:
# Check for missing values in each column of the DataFrame and display the total count of missing values for each column.
df.isnull().sum()


review       0
sentiment    0
dtype: int64

In [5]:
# Count the number of duplicate rows in the DataFrame and display the result.
df.duplicated().sum()


418

In [6]:
# Remove duplicate rows from the DataFrame and update the DataFrame in place.
df.drop_duplicates(inplace=True)


In [7]:
# Convert all text in the 'review' column to lowercase.
df['review'] = df['review'].str.lower()


In [8]:
import re

def remove_html_tags(text):
    # Remove HTML tags from the given text if it is a string.
    if isinstance(text, str):
        pattern = re.compile('<.*?>')  # Regular expression to match HTML tags
        return pattern.sub(r'', text)  # Replace HTML tags with an empty string
    else:
        return text  # Return the text as is if it's not a string

# Apply the remove_html_tags function to the 'review' column to clean the text data.
df['review'] = df['review'].apply(remove_html_tags)


In [9]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


In [10]:
def remove_url(text):
    # Remove URLs from the given text if it is a string.
    if isinstance(text, str):
        pattern = re.compile(r'https?://\S+|www\.\S+')
        return pattern.sub(r'', text)
    else:
        return text

# Apply the remove_url function to the 'review' column to remove URLs from the text data.
df['review'] = df['review'].apply(remove_url)


In [11]:
def remove_punc(text, exclude):
    # Remove punctuation characters from the given text based on the specified exclusion list.
    for char in exclude:
        text = text.replace(char, '')  # Replace each punctuation character with an empty string
    return text

exclude = '''!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~'''  # Define a string of punctuation characters to exclude
# Apply the remove_punc function to the 'review' column, excluding the specified punctuation characters.
df['review'] = df['review'].apply(lambda x: remove_punc(x, exclude))


In [12]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,i am a catholic taught in parochial elementary...,negative
49998,im going to have to disagree with the previous...,negative


In [13]:
from nltk.corpus import stopwords
import nltk

# Download the stopwords corpus if not already downloaded
nltk.download('stopwords')

# Get the set of English stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    # Remove stopwords from the given text if it is a string.
    if isinstance(text, str):
        new_text = []
        for word in text.split():
            if word.lower() not in stop_words:
                new_text.append(word)
        return " ".join(new_text)
    else:
        return text


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Apply the remove_stopwords function to the 'review' column to remove stopwords from the text data.
df['review'] = df['review'].apply(remove_stopwords)


In [15]:
# Create a new DataFrame 'df2' by sampling 25,000 random rows from the original DataFrame 'df'.
df2 = df.sample(25000)


In [16]:
# Count the occurrences of each unique value in the 'sentiment' column of the DataFrame 'df2' and display the result.
df2['sentiment'].value_counts()


sentiment
positive    12591
negative    12409
Name: count, dtype: int64

In [17]:
# Create features (X) and target variable (y) from the DataFrame 'df2'.
# X contains all rows and the first column of 'df2' (excluding the 'sentiment' column).
X = df2.iloc[:, 0:1]

# y contains the 'sentiment' column of 'df2'.
y = df2['sentiment']


In [18]:
y

38100    negative
26692    positive
39338    negative
906      positive
2455     positive
           ...   
4969     positive
34640    positive
9622     negative
25044    positive
2602     positive
Name: sentiment, Length: 25000, dtype: object

In [19]:
from sklearn.preprocessing import LabelEncoder

# Initialize a LabelEncoder object
encoder = LabelEncoder()

# Encode the target variable 'y' into numerical values
y = encoder.fit_transform(y)


In [20]:
from sklearn.preprocessing import LabelEncoder

# Initialize a LabelEncoder object
encoder = LabelEncoder()

# Encode the target variable 'y' into numerical values
y = encoder.fit_transform(y)


In [21]:
y

array([0, 1, 0, ..., 0, 1, 1])

In [22]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets, with 80% for training and 20% for testing, using a random state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [23]:
X_train.shape

(20000, 1)

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize a CountVectorizer object to convert text data into a matrix of token counts
cv = CountVectorizer()


In [26]:
# Convert the text data in the 'review' column of X_train into a matrix of token counts
# and transform the text data in the 'review' column of X_test using the same vocabulary
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()


In [27]:
X_train_bow.shape

(20000, 125268)

In [29]:
import time

In [30]:
import time
from sklearn.naive_bayes import GaussianNB

# Start timing the training process
start_time = time.time()

# Initialize and train a Gaussian Naive Bayes model using the bag-of-words features
gnb = GaussianNB()
gnb.fit(X_train_bow, y_train)

# End timing the training process and calculate the total time taken
end_time = time.time()
print(f"Total time: {end_time - start_time:.2f} seconds")


Total time: 49.33 seconds


In [31]:
# Start timing the prediction process
start_time = time.time()

# Predict the labels for the test set using the trained Gaussian Naive Bayes model
y_pred = gnb.predict(X_test_bow)

# Calculate the accuracy score of the model
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

# End timing the prediction process and calculate the total time taken
end_time = time.time()
print(f"Accuracy: {accuracy:.4f}")
print(f"Total time: {end_time - start_time:.2f} seconds")


Total time: 11.05 seconds


In [32]:
confusion_matrix(y_test,y_pred)

array([[1882,  609],
       [1173, 1336]])

In [33]:
# Start timing the training process
start_time = time.time()

# Initialize and train a Random Forest Classifier using the bag-of-words features
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train_bow, y_train)

# Predict the labels for the test set and calculate the accuracy score
y_pred = rf.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred)

# End timing the training process and calculate the total time taken
end_time = time.time()
print(f"Total time: {end_time - start_time:.2f} seconds")
print(f"Accuracy: {accuracy:.4f}")


Total time: 619.25 seconds


accuracy_score: 85.2800


In [40]:
# Start timing the training process
start_time = time.time()

# Initialize a CountVectorizer with a maximum of 3000 features
cv = CountVectorizer(max_features=3000)

# Convert the text data into bag-of-words features using the CountVectorizer
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

# Initialize and train a Random Forest Classifier using the bag-of-words features
rf = RandomForestClassifier()
rf.fit(X_train_bow, y_train)

# Predict the labels for the test set and calculate the accuracy score
y_pred = rf.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred)

# End timing the training process and calculate the total time taken
end_time = time.time()
print(f"Total time: {end_time - start_time:.2f} seconds")
print(f"Accuracy: {accuracy:.4f}")


Total time: 45.50 seconds
Accuracy: 0.83


In [43]:
# Start timing the training process
start_time = time.time()

# Initialize a CountVectorizer with ngram_range=(1,2) and a maximum of 10000 features
cv = CountVectorizer(ngram_range=(1,2), max_features=10000)

# Convert the text data into bag-of-words features using the CountVectorizer
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

# Initialize and train a Random Forest Classifier using the bag-of-words features
rf = RandomForestClassifier()
rf.fit(X_train_bow, y_train)

# Predict the labels for the test set and calculate the accuracy score
y_pred = rf.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred)

# End timing the training process and calculate the total time taken
end_time = time.time()
print(f"Total time: {end_time - start_time:.2f} seconds")
print(f"Accuracy: {accuracy:.4f}")


Total time: 137.86 seconds
Accuracy: 0.85


## using tfidf

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TfidfVectorizer object
tfidf = TfidfVectorizer()

# Convert the text data into TF-IDF features using the TfidfVectorizer
X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
X_test_tfidf = tfidf.transform(X_test['review'])


In [45]:
# Start timing the training process
start_time = time.time()

# Initialize and train a Random Forest Classifier using the TF-IDF features
rf = RandomForestClassifier()
rf.fit(X_train_tfidf, y_train)

# Predict the labels for the test set and calculate the accuracy score
y_pred = rf.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)

# End timing the training process and calculate the total time taken
end_time = time.time()
print(f"Total time: {end_time - start_time:.2f} seconds")
print(f"Accuracy: {accuracy:.4f}")


Total time: 598.21 seconds
Accuracy: 0.85
