In [10]:
import os
import glob

import numpy as np

import pandas as pd
pd.set_option('display.max_colwidth', 50)

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Render inline matplotlib figures in the 'retina' (high-resolution) format.
%config InlineBackend.figure_format='retina'
# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Exploratory Data Analysis

### Reading in the positive and negative reviews


In [11]:
# Glob patterns selecting the review text files of each class.
# Point these at wherever the dataset lives on your machine.
pos_file_path = "data/pos/*.txt"
neg_file_path = "data/neg/*.txt"

def get_data(file_path):
    """Load every text file matching a glob pattern into a DataFrame.

    Args:
        file_path: Glob pattern (e.g. ``'data/pos/*.txt'``) selecting the
            files to read.

    Returns:
        A DataFrame with one row per matched file and two columns:
        ``FileName`` (base name of the file) and ``Content`` (the file's
        full text, decoded as UTF-8). Rows follow the order returned by
        ``glob.glob``. If nothing matches, the frame is empty but still
        carries both columns.
    """
    records = []
    for path in glob.glob(file_path):
        with open(path, 'r', encoding='utf-8') as handle:
            records.append({'FileName': os.path.basename(path), 'Content': handle.read()})

    # Build the frame once: DataFrame.append was removed in pandas 2.0, and
    # appending row by row copied the whole frame on every iteration.
    return pd.DataFrame(records, columns=['FileName', 'Content'])

In [18]:
# Load the positive and the negative reviews into one DataFrame each.
pos_unclean_df, neg_unclean_df = map(get_data, (pos_file_path, neg_file_path))

In [19]:
# Preview the first five positive reviews.
pos_unclean_df.head(n=5)

Unnamed: 0,FileName,Content
0,20935_9.txt,"I just watched ""return from lonesome dove"" and..."
1,12390_8.txt,This movie looked like a classic in the cheesy...
2,9820_8.txt,Jay Chou plays an orphan raised in a kung fu s...
3,883_7.txt,"Ooverall, the movie was fairly good, a good ac..."
4,9063_8.txt,"This movie is fun to watch. If you liked ""Dave..."


In [20]:
# Preview the first five negative reviews.
neg_unclean_df.head(n=5)

Unnamed: 0,FileName,Content
0,3314_4.txt,Stan Laurel and Oliver Hardy are the most famo...
1,17112_1.txt,"I saw this movie a few years ago, and man I ne..."
2,3008_1.txt,I watched this film few times and all i can sa...
3,17951_1.txt,From reading the back of the box my first thou...
4,9487_1.txt,This is a film of immense appeal to a relative...


### Extracting the rating from the file name

In [29]:
# Parse the rating from file names shaped like '<id>_<rating>.txt'.
# Capturing the whole digit run keeps two-digit ratings intact: taking only
# the first character after the underscore turned a rating of 10 into 1.
pos_unclean_df['rating'] = (
    pos_unclean_df['FileName']
    .str.extract(r'_(\d+)\.', expand=False)
    .astype('int64')
)
pos_unclean_df.head()

Unnamed: 0,FileName,Content,rating
0,20935_9.txt,"I just watched ""return from lonesome dove"" and...",9
1,12390_8.txt,This movie looked like a classic in the cheesy...,8
2,9820_8.txt,Jay Chou plays an orphan raised in a kung fu s...,8
3,883_7.txt,"Ooverall, the movie was fairly good, a good ac...",7
4,9063_8.txt,"This movie is fun to watch. If you liked ""Dave...",8


In [30]:
# Parse the rating from file names shaped like '<id>_<rating>.txt'.
# Capture the whole digit run rather than only the first character after
# the underscore, so a multi-digit rating would not be truncated.
neg_unclean_df['rating'] = (
    neg_unclean_df['FileName']
    .str.extract(r'_(\d+)\.', expand=False)
    .astype('int64')
)
neg_unclean_df.head()

Unnamed: 0,FileName,Content,rating
0,3314_4.txt,Stan Laurel and Oliver Hardy are the most famo...,4
1,17112_1.txt,"I saw this movie a few years ago, and man I ne...",1
2,3008_1.txt,I watched this film few times and all i can sa...,1
3,17951_1.txt,From reading the back of the box my first thou...,1
4,9487_1.txt,This is a film of immense appeal to a relative...,1
