In [66]:
#Importing required libraries: Regex operations, pandas, numpy, SVC model
import re                  
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from pandas import DataFrame
import numpy as np
from sklearn.svm import SVC

In [67]:
def add_essay_training(data, essay_set, essay, score):
    if essay_set not in data:
        data[essay_set] = {"essay":[],"score":[]}
    data[essay_set]["essay"].append(essay)
    data[essay_set]["score"].append(score)

def add_essay_test(data, essay_set, essay, prediction_id):
    if essay_set not in data:
        data[essay_set] = {"essay":[], "prediction_id":[]}
    data[essay_set]["essay"].append(essay)
    data[essay_set]["prediction_id"].append(prediction_id)

def read_training_data(training_file):
    f = open(training_file)
    f.readline()

    training_data = {}
    for row in f:
        row = row.strip().split("\t")
        essay_set = row[1]
        essay = row[2]
        domain1_score = int(row[6])
        if essay_set == "2":
            essay_set = "2_1"
        add_essay_training(training_data, essay_set, essay, domain1_score)
        
        if essay_set == "2_1":
            essay_set = "2_2"
            domain2_score = int(row[9])
            add_essay_training(training_data, essay_set, essay, domain2_score)
    
    return training_data

def read_test_data(test_file):
    f = open(test_file)
    f.readline()

    test_data = {}
    for row in f:
        row = row.strip().split("\t")
        essay_set = row[1]
        essay = row[2]
        domain1_predictionid = int(row[3])
        if essay_set == "2": 
            domain2_predictionid = int(row[4])
            add_essay_test(test_data, "2_1", essay, domain1_predictionid)
            add_essay_test(test_data, "2_2", essay, domain2_predictionid)
        else:
            add_essay_test(test_data, essay_set, essay, domain1_predictionid)
    return test_data

def get_character_count(essay):
    return len(essay)

def get_word_count(essay):
    return len(re.findall(r"\s", essay))+1

def extract_features(essays, feature_functions):
    return [[f(es) for f in feature_functions] for es in essays]


In [68]:
print("Reading Training Data")
training = read_training_data("Desktop/hackathon/ASAP-AES/Data/training_set_rel3.tsv")
print("Reading Validation Data")
test = read_test_data("Desktop/hackathon/ASAP-AES/Data/valid_set.tsv")

Reading Training Data
Reading Validation Data


In [69]:
strain = DataFrame.from_csv("Desktop/hackathon/ASAP-AES/Data/training_set_rel3.tsv", sep="\t")
strain.head()

Unnamed: 0_level_0,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,rater1_trait1,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
essay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,,...,,,,,,,,,,
2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,,...,,,,,,,,,,
3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,,...,,,,,,,,,,
4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,,...,,,,,,,,,,
5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,,...,,,,,,,,,,


In [70]:
stest = DataFrame.from_csv("Desktop/hackathon/ASAP-AES/Data/training_set_rel3.tsv", sep="\t")
stest.head()

Unnamed: 0_level_0,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,rater1_trait1,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
essay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,,...,,,,,,,,,,
2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,,...,,,,,,,,,,
3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,,...,,,,,,,,,,
4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,,...,,,,,,,,,,
5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,,...,,,,,,,,,,


In [71]:
feature_functions = [get_character_count, get_word_count]

essay_sets = sorted(training.keys())
predictions = {}

In [72]:
for es_set in essay_sets:
        print("Making Predictions for Essay Set %s" % es_set)
        features = extract_features(training[es_set]["essay"],feature_functions)
        rf = RandomForestRegressor(n_estimators = 100)
        rf.fit(features,training[es_set]["score"])
        features = extract_features(test[es_set]["essay"], feature_functions)
        predicted_scores = rf.predict(features)
        for pred_id, pred_score in zip(test[es_set]["prediction_id"], 
                                       predicted_scores):
            predictions[pred_id] = round(pred_score)
print features

Making Predictions for Essay Set 1
Making Predictions for Essay Set 2_1
Making Predictions for Essay Set 2_2
Making Predictions for Essay Set 3
Making Predictions for Essay Set 4
Making Predictions for Essay Set 5
Making Predictions for Essay Set 6
Making Predictions for Essay Set 7
Making Predictions for Essay Set 8
[[4018, 738], [4671, 867], [3403, 658], [4707, 844], [4355, 860], [3469, 675], [1433, 262], [1960, 389], [2035, 379], [3382, 629], [2167, 404], [3831, 732], [2333, 465], [4562, 849], [2643, 530], [2256, 430], [3465, 627], [4067, 761], [2753, 529], [1664, 318], [1677, 333], [4587, 852], [3813, 719], [4341, 823], [4236, 853], [2702, 499], [3003, 565], [1515, 287], [1828, 344], [1785, 345], [3249, 609], [3217, 649], [4433, 845], [4535, 848], [3463, 727], [2115, 388], [3427, 627], [2705, 585], [4423, 833], [2855, 543], [2209, 420], [4714, 847], [2308, 501], [3409, 704], [3101, 614], [4161, 838], [3061, 627], [4151, 767], [4163, 766], [3419, 677], [4147, 850], [2085, 389], [357

In [73]:
print predicted_scores

[ 40.93  39.47  38.99  37.98  41.98  37.75  34.39  36.39  36.21  40.3   37.2
  42.54  34.42  38.67  37.94  32.6   41.74  38.11  38.09  32.41  33.3
  40.31  40.76  39.71  36.15  37.02  38.92  33.56  37.83  34.67  42.1
  37.51  40.82  36.89  35.58  38.16  38.18  37.04  38.44  36.59  39.22
  39.26  32.19  30.68  35.46  33.79  36.78  36.58  37.16  37.51  33.58
  37.79  38.21  36.39  37.79  39.31  41.68  42.1   38.75  38.11  40.9
  39.54  38.85  42.78  34.72  39.65  43.8   38.04  40.09  35.39  41.49
  33.21  33.06  35.94  37.3   41.06  40.2   40.38  43.98  37.99  36.2
  41.77  40.63  36.49  37.08  35.08  20.82  32.44  39.8   32.35  33.93
  37.11  39.02  30.85  43.8   33.57  35.02  38.99  42.58  42.89  37.89
  37.39  37.67  38.26  33.28  38.79  42.62  40.71  39.9   37.79  41.71
  41.09  41.99  39.61  40.53  35.46  41.99  34.76  37.08  41.79  33.57
  40.01  30.2   34.91  36.03  36.43  31.09  30.1   36.49  32.97  44.73
  38.55  36.57  38.24  37.9   34.21  37.24  35.9   34.57  35.13  35.68
  38

In [75]:
output_file = "length_benchmark_rf.csv"
print("Writing submission to %s" % output_file)
f = open(output_file, "w")
f.write("prediction_id,predicted_score\n")
for key in sorted(predictions.keys()):
   f.write("%d,%d\n" % (key,predictions[key]))
f.close()

Writing submission to length_benchmark_rf.csv


In [76]:
for es_set in essay_sets:
        print("Making Predictions for Essay Set %s" % es_set)
        features = extract_features(training[es_set]["essay"],feature_functions)
        clf = SVC()
        clf.fit(features,training[es_set]["score"]) 
        '''SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
        decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
        max_iter=-1, probability=False, random_state=None, shrinking=True,
        tol=0.001, verbose=False)
        print(clf.predict([[-0.8, -1]]))'''
        features = extract_features(test[es_set]["essay"], feature_functions)
        predicted_scores = clf.predict(features)
        for pred_id, pred_score in zip(test[es_set]["prediction_id"], 
                                       predicted_scores):
            predictions[pred_id] = round(pred_score)
print features

Making Predictions for Essay Set 1
Making Predictions for Essay Set 2_1
Making Predictions for Essay Set 2_2
Making Predictions for Essay Set 3
Making Predictions for Essay Set 4
Making Predictions for Essay Set 5
Making Predictions for Essay Set 6
Making Predictions for Essay Set 7
Making Predictions for Essay Set 8
[[4018, 738], [4671, 867], [3403, 658], [4707, 844], [4355, 860], [3469, 675], [1433, 262], [1960, 389], [2035, 379], [3382, 629], [2167, 404], [3831, 732], [2333, 465], [4562, 849], [2643, 530], [2256, 430], [3465, 627], [4067, 761], [2753, 529], [1664, 318], [1677, 333], [4587, 852], [3813, 719], [4341, 823], [4236, 853], [2702, 499], [3003, 565], [1515, 287], [1828, 344], [1785, 345], [3249, 609], [3217, 649], [4433, 845], [4535, 848], [3463, 727], [2115, 388], [3427, 627], [2705, 585], [4423, 833], [2855, 543], [2209, 420], [4714, 847], [2308, 501], [3409, 704], [3101, 614], [4161, 838], [3061, 627], [4151, 767], [4163, 766], [3419, 677], [4147, 850], [2085, 389], [357

In [77]:
print predicted_scores

[40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
 40 40 40 40 40 40 40 40]


In [78]:
output_file = "length_benchmark_svc.csv"
print("Writing submission to %s" % output_file)
f = open(output_file, "w")
f.write("prediction_id,predicted_score\n")
for key in sorted(predictions.keys()):
   f.write("%d,%d\n" % (key,predictions[key]))
f.close()

Writing submission to length_benchmark_svc.csv


In [87]:
#Word_to_vec implementation 

#loding all required libraries
from KaggleWord2VecUtility import KaggleWord2VecUtility
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#loading test and train data
print "loading data..."
if __name__=='__main__':
    train=pd.read_csv('Desktop/hackathon/ASAP-AES/Data/training_set_rel3.tsv',header=0,delimiter='\t',quoting=3)
    test=pd.read_csv('Desktop/hackathon/ASAP-AES/Data/valid_set.tsv',header=0,delimiter='\t',quoting=3)

#word2vec
print "creating word vectors..."

clean_train_reviews=[]
for i in xrange(len(train["review"])):
    clean_train_reviews.append(" ".join(KaggleWord2VecUtility.reviewto_wordlist(train["review"][i],True)))

#create Bag of Words
print "creating a vector..."
vector=TfidfVectorizer(analyzer="word",max_features=50000,sublinear_tf=True,stop_words = 'english',ngram_range=(1, 2), use_idf=1,smooth_idf=1,strip_accents='unicode',min_df=3)

#tokenizing the vectors
print "tokenizing the vector..." 
vector=vector.fit(clean_train_reviews)
train_data=vector.transform(clean_train_reviews)


y=train["sentiment"]

#splitting train data for testing purposes
print "splitting training data for testing purposes..."
X_train,X_test,y_train,y_test=train_test_split(train_data,y,test_size=0.2,random_state=42)


showdown=False
op=True

#showdown(removed Gaussian as performed poorly)
if showdown:
    print "Classifier Tasks"
    classifiers=[
                RandomForestClassifier(n_estimators=150),
                MultinomialNB(alpha=0.0001),
                SGDClassifier(loss='modified_huber',warm_start="True"),
                LogisticRegression(penalty="l2",C=1)
                ]
    count=0
    for clf in classifiers:
        count+=1
        print "training ",count
        clf.fit(X_train,y_train)
        print "testing ",count
        y_pred=clf.predict(X_test)
        print "result ",count,":",accuracy_score(y_test,y_pred)
if op:
    print "training classifier"
    clf=SVC() #performing better than others
    clf.fit(train_data,y)

    print "training complete"

    clean_test_reviews=[]
    print "creating test data"
    for i in xrange(len(test["review"])):
        clean_test_reviews.append(" ".join(KaggleWord2VecUtility.reviewto_wordlist(test["review"][i],True)))
    test_data=vector.transform(clean_test_reviews)

    print "testing..."
    y_pred=clf.predict_proba(test_data)[:,1]
    print "testing complete"
    print "preparing submission file"
    submission=pd.DataFrame(data={"id":test['id'],"sentiment":y_pred})
    submission.to_csv('asap_word_to_vec.csv',quoting=3,index=False)

loading data...
creating word vectors...


KeyError: 'review'