#### This is python code file to load all the text files from train and test folder, read text from each file and save it to one csv file for futher data processing

In [1]:
import pandas as pd
import numpy as np
import re
import glob

In [2]:
def load_data_label(path):
    
    """
    Function takes path the folder containing text files 
    and returns list containing text from each text file. 
    It also check for directory and creates labels to return list.
    Labels sentiment as positive and negative
    
    """
    features = []
    labels = []
    
    files = glob.glob(path + '*.txt')
    
    for file in files:
        
        f = open(file)
        text = f.read()
        features.append(text)
        
        if 'pos' in file:
            
            labels.append('positive')
        else:
            labels.append('negative')
        f.close()
            
    return features,labels

# Load , Process and Save Train data

In [3]:
pos_feat , pos_lb = load_data_label('/Users/manishatakale/Downloads/SentimentAnalysis/Proj1_IMDB_Data/Data/IMDB/train/pos/')

In [4]:
neg_feat , neg_lb = load_data_label('/Users/manishatakale/Downloads/SentimentAnalysis/Proj1_IMDB_Data/Data/IMDB/train/neg/')

In [5]:
pos_review = pd.DataFrame()
neg_review = pd.DataFrame()

In [6]:
pos_review['reviews'] = pos_feat
pos_review['sentiments'] = pos_lb

In [7]:
neg_review['reviews'] = neg_feat
neg_review['sentiments'] = neg_lb

In [8]:
pos_review.head()

Unnamed: 0,reviews,sentiments
0,For a movie that gets no respect there sure ar...,positive
1,Bizarre horror movie filled with famous faces ...,positive
2,"A solid, if unremarkable film. Matthau, as Ein...",positive
3,It's a strange feeling to sit alone in a theat...,positive
4,"You probably all already know this by now, but...",positive


In [9]:
neg_review.head()

Unnamed: 0,reviews,sentiments
0,Working with one of the best Shakespeare sourc...,negative
1,"Well...tremors I, the original started off in ...",negative
2,Ouch! This one was a bit painful to sit throug...,negative
3,"I've seen some crappy movies in my life, but t...",negative
4,"""Carriers"" follows the exploits of two guys an...",negative


In [10]:
train_data = pd.concat([pos_review, neg_review], axis=0)

In [11]:
train_data.head()

Unnamed: 0,reviews,sentiments
0,For a movie that gets no respect there sure ar...,positive
1,Bizarre horror movie filled with famous faces ...,positive
2,"A solid, if unremarkable film. Matthau, as Ein...",positive
3,It's a strange feeling to sit alone in a theat...,positive
4,"You probably all already know this by now, but...",positive


In [12]:
train_data.tail()

Unnamed: 0,reviews,sentiments
12495,"My comments may be a bit of a spoiler, for wha...",negative
12496,"The ""saucy"" misadventures of four au pairs who...",negative
12497,"Oh, those Italians! Assuming that movies about...",negative
12498,Eight academy nominations? It's beyond belief....,negative
12499,"Not that I dislike childrens movies, but this ...",negative


In [13]:
train_data.isna().sum()

reviews       0
sentiments    0
dtype: int64

In [14]:
train_data.to_csv('/Users/manishatakale/Downloads/SentimentAnalysis/Proj1_IMDB_Data/Data/IMDB/train_data.csv', 
                  sep='\t',encoding='utf-8', index=False)

# Load , Process and Save Test Data

In [15]:
pos_feat_test , pos_lb_test = load_data_label('/Users/manishatakale/Downloads/SentimentAnalysis/Proj1_IMDB_Data/Data/IMDB/test/pos/')

In [16]:
neg_feat_test , neg_lb_test = load_data_label('/Users/manishatakale/Downloads/SentimentAnalysis/Proj1_IMDB_Data/Data/IMDB/test/neg/')

In [17]:
pos_review_test = pd.DataFrame()
neg_review_test = pd.DataFrame()

In [18]:
pos_review_test['reviews'] = pos_feat_test
pos_review_test['sentiments'] = pos_lb_test

In [19]:
neg_review_test['reviews'] = neg_feat_test
neg_review_test['sentiments'] =  neg_lb_test

In [20]:
test_data = pd.concat([pos_review_test, neg_review_test], axis=0)

In [21]:
test_data.to_csv('/Users/manishatakale/Downloads/SentimentAnalysis/Proj1_IMDB_Data/Data/IMDB/test_data.csv', 
                 sep='\t', encoding='utf-8', index=False)