In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('averaged_perceptron_tagger')
from sklearn.metrics import mean_squared_error

In [2]:
import pandas as pd
import syllables

data = pd.read_csv("Output_New.csv", header=None)
data.columns = ['sentence_id', 'word_id', 'word', 'start_time', 'end_time', 'offset']
data['length'] = data.apply (lambda row: len(row["word"]), axis=1)
data['syllable_count'] = data.apply (lambda row: syllables.estimate(row["word"]), axis=1)
xtrain = np.array(data.length.values.astype(np.float32)).reshape(-1,1)
ycorrect = np.array(data.offset.values.astype(np.float32)).reshape(-1,1)
data.head()


Unnamed: 0,sentence_id,word_id,word,start_time,end_time,offset,length,syllable_count
0,0,0,Mister,0.03,0.08,0.05,6,2
1,0,1,quilter,0.08,1.02,0.94,7,2
2,0,2,is,1.02,1.04,0.02,2,1
3,0,3,the,1.04,1.05,0.01,3,1
4,0,4,Apostle,1.05,2.01,0.96,7,2


In [3]:
data['prev_word_1_length'] = data.length.shift()
data['prev_word_2_length'] = data.length.shift(2)
data['prev_word_3_length'] = data.length.shift(3)
data['prev_word_4_length'] = data.length.shift(4)
data['next_word_1_length'] = data.length.shift(-1)
data['next_word_2_length'] = data.length.shift(-2)
data['next_word_3_length'] = data.length.shift(-3)
data['next_word_4_length'] = data.length.shift(-4)
data.head()

Unnamed: 0,sentence_id,word_id,word,start_time,end_time,offset,length,syllable_count,prev_word_1_length,prev_word_2_length,prev_word_3_length,prev_word_4_length,next_word_1_length,next_word_2_length,next_word_3_length,next_word_4_length
0,0,0,Mister,0.03,0.08,0.05,6,2,,,,,7.0,2.0,3.0,7.0
1,0,1,quilter,0.08,1.02,0.94,7,2,6.0,,,,2.0,3.0,7.0,2.0
2,0,2,is,1.02,1.04,0.02,2,1,7.0,6.0,,,3.0,7.0,2.0,3.0
3,0,3,the,1.04,1.05,0.01,3,1,2.0,7.0,6.0,,7.0,2.0,3.0,6.0
4,0,4,Apostle,1.05,2.01,0.96,7,2,3.0,2.0,7.0,6.0,2.0,3.0,6.0,7.0


In [4]:
# These comments required for new input csv file!
# data['tmp_offset'] = -1 * (data.start_time - data.start_time.shift(-1))
# data['offset'] = data.apply(lambda row: row['tmp_offset'] if row['tmp_offset'] >= 0 else row['end_time'] - row['start_time'], axis=1)
data['sentence_length'] = data.sentence_id.map(data.groupby('sentence_id').count().word_id)

In [5]:
for i in range(len(data)):
    for j in range(4):
        data.iloc[i, 8+j] = 0 if data.word_id[i] <= j else data.iloc[i, 8+j]
#         better version but showing some error! Need to debug it!
#         data['prev_word_{k}_length'.format(k=j+1)][i] = 0 if data.word_id[i] <= j else data['prev_word_{k}_length'.format(k=j+1)][i]



In [6]:
for i in range(len(data)):
    for j in range(4):
        data.iloc[i, 12+j] = 0 if (data.sentence_length[i] - data.word_id[i] - 1) <= j else data.iloc[i, 12+j]
#         better version but showing some error! Need to debug it!
#         data['next_word_{k}_length'.format(k=j+1)][i] = 0 if (data.sentence_length[i] - data.word_id[i] -1) <= j else data['next_word_{k}_length'.format(k=j+1)][i]


In [9]:
data['part_of_speech'] = data.apply (lambda row: nltk.pos_tag(row["word"])[0][1] , axis=1)
data['is_noun'] = data.apply (lambda row: 1 if nltk.pos_tag(row["word"])[0][1] in ['NN','NNS','NNP','NNPS'] else 0, axis=1)
data['is_adjective'] = data.apply (lambda row: 1 if nltk.pos_tag(row["word"])[0][1] in ['JJ','JJR','JJS'] else 0, axis=1)
data['is_pronoun'] = data.apply (lambda row: 1 if nltk.pos_tag(row["word"])[0][1] in ['PRP','PRP$','WP','WP$'] else 0, axis=1)
data['is_adverb'] = data.apply (lambda row: 1 if nltk.pos_tag(row["word"])[0][1] in ['RB','RBR','RBS','WRB'] else 0, axis=1)
data['is_verb'] = data.apply (lambda row: 1 if nltk.pos_tag(row["word"])[0][1] in ['VB','VBD','VBG','VBN','VBP','VBZ'] else 0, axis=1)
data['is_determiner'] = data.apply (lambda row: 1 if nltk.pos_tag(row["word"])[0][1] in ['DT','WDT'] else 0, axis=1)
data['is_foreign_word'] = data.apply (lambda row: 1 if nltk.pos_tag(row["word"])[0][1] == 'FW' else 0, axis=1)
data['is_conjunction'] = data.apply (lambda row: 1 if nltk.pos_tag(row["word"])[0][1] == 'CC' else 0, axis=1)
data['is_preposition'] = data.apply (lambda row: 1 if nltk.pos_tag(row["word"])[0][1] == 'IN' else 0, axis=1)


In [10]:
data.columns

Index(['sentence_id', 'word_id', 'word', 'start_time', 'end_time', 'offset',
       'length', 'syllable_count', 'prev_word_1_length', 'prev_word_2_length',
       'prev_word_3_length', 'prev_word_4_length', 'next_word_1_length',
       'next_word_2_length', 'next_word_3_length', 'next_word_4_length',
       'sentence_length', 'part_of_speech', 'is_noun', 'is_adjective',
       'is_pronoun', 'is_adverb', 'is_verb', 'is_determiner',
       'is_foreign_word', 'is_conjunction', 'is_preposition'],
      dtype='object')

In [22]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2)

In [34]:
training_columns = ['length', 'syllable_count',
                        'prev_word_1_length','prev_word_2_length','prev_word_3_length','prev_word_4_length',
                        'next_word_1_length','next_word_2_length','next_word_3_length','next_word_4_length',
                        'sentence_length','is_noun', 'is_adjective','is_pronoun', 'is_adverb', 'is_verb', 
                        'is_determiner','is_foreign_word', 'is_conjunction', 'is_preposition']
train_x = train[training_columns]
train_y = train.iloc[:,5]

test_x = test[training_columns]
test_y = test.iloc[:,5]


In [36]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=100)
regr.fit(train_x, train_y)
print(regr.feature_importances_)

[0.8540603 0.        0.1459397 0.        0.        0.        0.
 0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.        0.        0.        0.       ]


In [37]:
hyp = regr.predict(test_x)

In [38]:
from sklearn.metrics import mean_squared_error
score_for_test_set = mean_squared_error(hyp, test_y)
score_for_test_set

0.17652044248922033