## Importing libraries

In [1]:
import os
import glob
import pandas as pd
from numpy import *

import warnings
warnings.filterwarnings('ignore')

  from pandas.core import (


## Loading the dataset

In [2]:
# Glob patterns that select the raw review text files of each sentiment folder.
pos_file_path = "data/Raw/pos/*.txt"
neg_file_path = "data/Raw/neg/*.txt"

def get_data(file_path):
    """Read every file matching a glob pattern into a single DataFrame.

    Parameters
    ----------
    file_path : str
        Glob pattern (for example ``'data/Raw/pos/*.txt'``) selecting the
        files to load.

    Returns
    -------
    pandas.DataFrame
        One row per matched file with two columns: ``FileName`` (the base
        name of the file) and ``Content`` (the whole file, decoded as UTF-8).
        Rows follow the order returned by ``glob.glob`` and the index runs
        from 0 to n-1. When nothing matches, an empty frame with the same
        two columns is returned.
    """
    records = []
    for path in glob.glob(file_path):
        with open(path, 'r', encoding='utf-8') as handle:
            records.append({
                'FileName': os.path.basename(path),
                'Content': handle.read(),
            })
    # Build the frame once from the collected records: concatenating inside
    # the loop re-copies every earlier row on each iteration (quadratic cost).
    # Passing `columns` keeps the schema stable even when no file matched.
    return pd.DataFrame(records, columns=['FileName', 'Content'])

In [3]:
# Load the positive and the negative reviews into one DataFrame each.
pos_unclean_df, neg_unclean_df = map(get_data, (pos_file_path, neg_file_path))

In [4]:
# Preview the first five positive reviews.
pos_unclean_df.head(n=5)

Unnamed: 0,FileName,Content
0,20935_9.txt,"I just watched ""return from lonesome dove"" and..."
1,12390_8.txt,This movie looked like a classic in the cheesy...
2,9820_8.txt,Jay Chou plays an orphan raised in a kung fu s...
3,883_7.txt,"Ooverall, the movie was fairly good, a good ac..."
4,9063_8.txt,"This movie is fun to watch. If you liked ""Dave..."


In [5]:
# Preview the first five negative reviews.
neg_unclean_df.head(n=5)

Unnamed: 0,FileName,Content
0,3314_4.txt,Stan Laurel and Oliver Hardy are the most famo...
1,17112_1.txt,"I saw this movie a few years ago, and man I ne..."
2,3008_1.txt,I watched this film few times and all i can sa...
3,17951_1.txt,From reading the back of the box my first thou...
4,9487_1.txt,This is a film of immense appeal to a relative...


In [6]:
def extract_rating_from_filename(filename):
    """Return the integer rating embedded in a review file name.

    The rating is taken as the text between the first underscore and the
    next dot or underscore, e.g. ``'20935_9.txt'`` gives ``9``.

    Raises
    ------
    IndexError
        If ``filename`` contains no underscore.
    ValueError
        If the extracted text is not a valid integer.
    """
    after_underscore = filename.split("_")[1]
    # Keep only what precedes the first dot (the extension and anything after).
    rating_text, _, _ = after_underscore.partition(".")
    return int(rating_text)

In [7]:
# Derive the numeric rating of each positive review from its file name.
pos_unclean_df['rating'] = pos_unclean_df['FileName'].apply(extract_rating_from_filename)
pos_unclean_df.head(n=5)

Unnamed: 0,FileName,Content,rating
0,20935_9.txt,"I just watched ""return from lonesome dove"" and...",9
1,12390_8.txt,This movie looked like a classic in the cheesy...,8
2,9820_8.txt,Jay Chou plays an orphan raised in a kung fu s...,8
3,883_7.txt,"Ooverall, the movie was fairly good, a good ac...",7
4,9063_8.txt,"This movie is fun to watch. If you liked ""Dave...",8


In [8]:
# Distinct rating values among the positive reviews, in order of first appearance.
pos_ratings = pd.unique(pos_unclean_df['rating'])
print(pos_ratings)

[ 9  8  7 10]


In [9]:
# Derive the numeric rating of each negative review from its file name.
neg_unclean_df['rating'] = neg_unclean_df['FileName'].apply(extract_rating_from_filename)
neg_unclean_df.head(n=5)

Unnamed: 0,FileName,Content,rating
0,3314_4.txt,Stan Laurel and Oliver Hardy are the most famo...,4
1,17112_1.txt,"I saw this movie a few years ago, and man I ne...",1
2,3008_1.txt,I watched this film few times and all i can sa...,1
3,17951_1.txt,From reading the back of the box my first thou...,1
4,9487_1.txt,This is a film of immense appeal to a relative...,1


In [10]:
# Distinct rating values among the negative reviews, in order of first appearance.
neg_ratings = pd.unique(neg_unclean_df['rating'])
print(neg_ratings)

[4 1 3 2]


In [11]:
# Preview the first ten positive reviews together with their ratings.
pos_unclean_df.head(n=10)

Unnamed: 0,FileName,Content,rating
0,20935_9.txt,"I just watched ""return from lonesome dove"" and...",9
1,12390_8.txt,This movie looked like a classic in the cheesy...,8
2,9820_8.txt,Jay Chou plays an orphan raised in a kung fu s...,8
3,883_7.txt,"Ooverall, the movie was fairly good, a good ac...",7
4,9063_8.txt,"This movie is fun to watch. If you liked ""Dave...",8
5,10186_10.txt,I loved this movie. In fact I loved being an a...,10
6,22540_10.txt,Andaz Apna Apna is my favorite comedy movie of...,10
7,147_10.txt,"In the very first episode of Friends, which ai...",10
8,3092_10.txt,"This, like Murder She Wrote, is one of those s...",10
9,15251_8.txt,Pretty good movie about a man and his wife who...,8


In [12]:
# Preview the first ten negative reviews together with their ratings.
neg_unclean_df.head(n=10)

Unnamed: 0,FileName,Content,rating
0,3314_4.txt,Stan Laurel and Oliver Hardy are the most famo...,4
1,17112_1.txt,"I saw this movie a few years ago, and man I ne...",1
2,3008_1.txt,I watched this film few times and all i can sa...,1
3,17951_1.txt,From reading the back of the box my first thou...,1
4,9487_1.txt,This is a film of immense appeal to a relative...,1
5,4518_1.txt,This is one of those movies you see in the vid...,1
6,4604_4.txt,I admit the problem I have with the much-celeb...,4
7,5743_3.txt,This movie is not as good as all think. the ac...,3
8,9056_1.txt,"I just sat in the theater bored as hell, i wan...",1
9,23687_3.txt,"This early film from director Bob Clark (""Pork...",3


## Convert Files to CSVs

In [13]:
# Write both frames to the working directory. index=True (the pandas default)
# also stores the row index as the first, unnamed column of each CSV.
pos_unclean_df.to_csv(path_or_buf="pos.csv", index=True)
neg_unclean_df.to_csv(path_or_buf="neg.csv", index=True)