In [3]:
import pandas as pd
import pprint
import pickle
import string
from tabulate import tabulate

import sys
import os
import time

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

In [4]:
full_program_time_1 = time.time()
for i in range(1, 9):
    
    print('Load : File ', i)
    
    #Read the train data
    pkl_file = open('outputs/pkl/preprocessed_data_%s.pkl'%i, 'rb')
    data = pickle.load(pkl_file)
    pkl_file.close()
    
    
    if i >= 6:
        
        #Untokenize the data
        print('Untokenize : File ', i)
        
        count = 0
        for line in data.content:
            data.content[count] = "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in line]).strip()
            count = count + 1
            
    X = data.content
    y = data.sentiment

    vectorizer = TfidfVectorizer(min_df = 5,
                                max_df = 0.8,
                                sublinear_tf = True,
                                use_idf = True)

    #Transform the words to feature vectors
    
    print('Transform : File ', i)
    
    X = vectorizer.fit_transform(X)
    
    
    ############### Random Forest ###############
    
    print('Training : File ', i)
    
    rf_classifier = RandomForestClassifier(n_estimators = 100, oob_score = False)

    rf_classifier_score = cross_val_score(rf_classifier, X, y, cv=10)
    rf_classifier_mean_score = rf_classifier_score.mean()
    
    ############### Random Forest ###############
    
    print('Save : File ', i)
    
    result = [['rf_classifier_result', rf_classifier_score, rf_classifier_mean_score]]
    
    df = pd.DataFrame(result,columns=['Random Forest','Score', 'Mean Score'])
    
    #Persist the model & Vectorizer
    joblib.dump(rf_classifier, 'Backup/RF/3.1_rf_model_cv_%s.pkl'%i)
    joblib.dump(vectorizer, 'Backup/RF/3.1_rf_model_cv_%s_vectorizer.pkl'%i)
    
    #Save the processed data into csv
    df.to_csv('Outputs/csv/RF/3.1_rf_cv_%s_result.csv'%i, index = False)
    
    #Save the file in pickle format
    outputs = open('Outputs/pkl/RF/3.1_rf_cv_%s_result.pkl'%i,'wb')
    pickle.dump(df, outputs)
    outputs.close()
    
full_program_time_2 = time.time()
second = full_program_time_2 - full_program_time_1
minute = second/60
hour = minute/60
print('Minute :', minute)
print('Hour :', hour)

Load : File  1
Transform : File  1
Training : File  1
Save : File  1
Load : File  2
Transform : File  2
Training : File  2
Save : File  2
Load : File  3
Transform : File  3
Training : File  3
Save : File  3
Load : File  4
Transform : File  4
Training : File  4
Save : File  4
Load : File  5
Transform : File  5
Training : File  5
Save : File  5
Load : File  6
Untokenize : File  6
Transform : File  6
Training : File  6
Save : File  6
Load : File  7
Untokenize : File  7
Transform : File  7
Training : File  7
Save : File  7
Load : File  8
Untokenize : File  8
Transform : File  8
Training : File  8
Save : File  8
Minute : 267.3134560783704
Hour : 4.45522426797284
