# Reading the data file using python 

### all the imports

In [110]:
import os
import pandas as pd


## getting all the files from the directory

In [101]:
def get_files(data_dir="data"):
    """Collect the paths of every ``.review`` file under *data_dir*.

    The directory and all of its sub-directories are walked with ``os.walk``.

    Args:
        data_dir: Directory to search. Defaults to ``"data"``, resolved
            relative to the current working directory.

    Returns:
        A list of file paths (the walked directory joined with the file
        name), in the order ``os.walk`` yields them. The list is empty when
        nothing matches.
    """
    all_file_path = []
    for root, _dirs, files in os.walk(data_dir):
        # keep only the review files found in this directory
        all_file_path.extend(
            os.path.join(root, name) for name in files if name.endswith(".review")
        )
    return all_file_path


## categorise the files into lists


In [102]:
def categorise_files(all_file_path):
    """Split file paths into positive, negative and unlabeled groups.

    A path goes to the positive group when it contains the text "positive",
    otherwise to the negative group when it contains "negative", and to the
    unlabeled group in every other case.

    Returns:
        A tuple ``(positive_paths, negative_paths, unlabeled_paths)`` of lists.
    """
    positive_paths, negative_paths, unlabeled_paths = [], [], []

    for path in all_file_path:
        # pick the destination list first, then append once
        if "positive" in path:
            bucket = positive_paths
        elif "negative" in path:
            bucket = negative_paths
        else:
            bucket = unlabeled_paths
        bucket.append(path)

    return positive_paths, negative_paths, unlabeled_paths

## reading the files

In [84]:
def read_file(file_path):
    """Open *file_path* in text mode and return its lines as a list.

    Each returned line keeps its trailing newline character, if it had one.
    """
    with open(file_path) as handle:
        # iterating a text file yields the same lines as readlines()
        return list(handle)

## making the list of review after reading the file

In [103]:
def build_review_list(items):
    """Group raw file lines into one list of lines per review.

    Args:
        items: Lines as read from the review files, each optionally ending
            with a newline character.

    Returns:
        A list of reviews. Each review is the list of lines collected up to a
        ``</review>`` line, with the trailing newline removed and without
        the ``<review>`` / ``</review>`` marker lines themselves.
    """
    reviews = []
    review = []
    for item in items:
        # strip only an actual newline; the last line of a file may not have
        # one, and blindly dropping the final character would corrupt it
        item = item.rstrip("\n")

        if item == '</review>':
            # end of the block: store the collected review and start afresh
            reviews.append(review)
            review = []
        elif item != '<review>':
            review.append(item)
    return reviews
   

In [88]:

# we need to run read_file on every file to get the data
# list to store positive review
# all_positive_reviews = []
# for file in positive_review_file_list:
#     all_positive_reviews.append(read_file(file))
# len(reviews)

## processing the review and making it ready to transfer into the dataframe

In [104]:
def process_review_list(list_of_reviews):
    """Turn each review (a list of lines) into a dictionary of field values.

    A line starting with ``<`` but not ``</`` opens a field and is used,
    unchanged, as the dictionary key. Lines starting with ``</`` and empty
    lines are ignored. Every other line is appended, without a separator, to
    the value of the most recently opened field of the same review. Text that
    appears before the first field of a review is skipped.

    Args:
        list_of_reviews: A list of reviews, each a list of line strings.

    Returns:
        A list with one dictionary per review, in input order. A field with
        no text lines maps to the empty string.
    """
    dict_review_list = []

    for review in list_of_reviews:
        dict_review = {}
        # tracked per review so one review's fields never leak into the next
        current_key = None

        for item in review:
            if not item:
                continue

            if item.startswith("<"):
                # closing tags carry no data; opening tags start a new field
                if not item.startswith("</"):
                    current_key = item
                    dict_review[current_key] = ""
                continue

            if current_key is not None:
                # append unconditionally so that a line repeating an earlier
                # value is not silently dropped
                dict_review[current_key] += item

        dict_review_list.append(dict_review)
    return dict_review_list

### converting the list of dicts into a dataframe, to be saved as a csv file or processed directly


In [105]:
def to_dataframe(list_of_dict):
    """Build a pandas DataFrame from a list of dictionaries.

    Args:
        list_of_dict: The dictionaries to convert, passed straight to
            ``pd.DataFrame``.

    Returns:
        The resulting ``pd.DataFrame``.
    """
    return pd.DataFrame(list_of_dict)

### converting the dataframe into the csv file and saving into the directory


In [106]:
def save_to_csv(df, file_name):
    """Write *df* to *file_name* as UTF-8 encoded CSV without the index column.

    Args:
        df: The DataFrame to save.
        file_name: Destination path, passed straight to ``DataFrame.to_csv``.

    Returns:
        Whatever ``DataFrame.to_csv`` returns for the given *file_name*.
    """
    return df.to_csv(file_name, index=False, encoding='utf-8')

### combining all the process to get single list of review of each category file found in directory

In [107]:

def convert_to_csv():
    """Convert the review files found under ``data`` into one CSV per category.

    The files returned by ``get_files`` are split into positive, negative and
    unlabeled groups with ``categorise_files``. For each group, all files are
    read, the lines are grouped into reviews, the reviews are turned into
    dictionaries, and the result is saved as CSV inside
    ``<current working directory>/data/csv_files/``. That directory is
    created when it does not exist yet.

    Returns:
        None.
    """
    output_dir = os.path.join(os.path.abspath(os.getcwd()), "data", "csv_files")
    # to_csv does not create missing directories, so make sure the target exists
    os.makedirs(output_dir, exist_ok=True)

    # one output name per category, in the order categorise_files returns them
    # NOTE(review): the names have no ".csv" extension and "unlabled" is
    # misspelt; both are kept so existing consumers of these paths keep
    # working - confirm before renaming
    file_names = ('positive_review', 'negative_review', 'unlabled_review')

    # get all the files and categorise them
    categories = categorise_files(get_files())

    for file_name, category in zip(file_names, categories):
        # hold the lines of every file of this category in one list
        items_holder = []
        for file_path in category:
            items_holder.extend(read_file(file_path))

        # lines -> reviews -> dictionaries -> dataframe
        list_of_reviews = build_review_list(items_holder)
        list_of_dict_reviews = process_review_list(list_of_reviews)
        data_frame = to_dataframe(list_of_dict_reviews)

        # save the dataframe of this category
        save_to_csv(data_frame, file_name=os.path.join(output_dir, file_name))
        
            

In [111]:
# Run the whole pipeline: read the review files under data/ and write one
# CSV-formatted file per review category into data/csv_files/
convert_to_csv()