In [2]:
def data_clean(essay_data, target_col="domain1_score"):
    """Build the hand-crafted essay feature table and save it, with the targets, to CSV.

    Columns containing any missing value are dropped, along with the two
    per-rater score columns and the essay id. Sixteen numeric features are then
    computed for every essay and the results are written to
    ``model_and_visualization/features_set_1.csv`` and
    ``model_and_visualization/target_set_1.csv`` (both with the index column,
    which ``model_building`` expects to find as ``Unnamed: 0``).

    Relies on names imported elsewhere in the notebook: ``pd``, ``nltk``,
    ``stopwords``, ``wn``, ``language_check``, ``Counter`` and ``time``.

    Parameters
    ----------
    essay_data : pandas.DataFrame
        Raw essay table. Must contain ``essay``, ``rater1_domain1``,
        ``rater2_domain1``, ``essay_id`` and ``target_col``. It is not
        modified, so the calling cell can be re-run safely.
    target_col : str, optional
        Name of the score column to predict.
        NOTE(review): the original code never named this column;
        ``"domain1_score"`` is an assumption based on the ``rater*_domain1``
        column names -- confirm against the CSV.

    Returns
    -------
    X_all : pandas.DataFrame
        ``essay`` followed by the 16 feature columns, indexed like the input.
    y_all : pandas.Series
        The target scores, named ``target``.

    Raises
    ------
    KeyError
        If a required column is missing (possibly because it contained a
        missing value and was dropped).
    """
    import os  # local import: only needed to create the output directory

    # Order matters: predict_grade builds its feature vector in this order.
    feature_names = ["word_count", "long_word_count", "avg_word_length_per_essay",
                     "wrong_words", "no_of_domain_words", "word_to_sent_ratio",
                     "num_of_characters", "sentence_count", "noun_count",
                     "verb_count", "comma_count", "punctuation_count",
                     "adjective_count", "adverb_count", "quotation_mark_count",
                     "spelling_mistakes"]

    # Non-mutating clean-up: the caller's frame keeps all of its columns.
    cleaned = (
        essay_data
        .dropna(axis=1)
        .drop(["rater1_domain1", "rater2_domain1", "essay_id"], axis=1)
    )
    missing = [col for col in ("essay", target_col) if col not in cleaned.columns]
    if missing:
        raise KeyError(
            "data_clean: required column(s) {} not found after dropping "
            "columns with missing values".format(missing)
        )

    def extract_features(essay, stop_words, tool):
        # Compute the 16 features for one essay; returns {feature name: value}.

        # Drop tokens starting with '@' (proper-noun tags in the data set) so
        # they do not produce false positives during POS tagging.
        text = " ".join(token for token in essay.split() if token[0] != '@')

        sent_count = len(nltk.sent_tokenize(text))

        words = nltk.word_tokenize(text)
        # NOTE(review): removing from the list while iterating over it skips
        # the token after every removal, so some stop words survive. Left
        # unchanged on purpose so these features stay identical to the ones
        # predict_grade computes at inference time; fix both places together
        # and retrain the model.
        for word in words:
            if word in stop_words:
                words.remove(word)
        word_count = len(words)

        total_word_length = sum(len(word) for word in words)
        tag_counts = Counter(tag for _, tag in nltk.pos_tag(words))
        # Words WordNet knows count as "domain words"; the rest as "wrong words".
        known_words = sum(1 for word in words if wn.synsets(word))

        return {
            "word_count": word_count,
            "long_word_count": sum(1 for word in words if len(word) > 6),
            # Guard the ratios: an essay with no words / sentences used to
            # raise ZeroDivisionError.
            "avg_word_length_per_essay": (
                round(total_word_length / float(word_count), 2) if word_count else 0.0
            ),
            "wrong_words": word_count - known_words,
            "no_of_domain_words": known_words,
            "word_to_sent_ratio": (
                round(word_count / float(sent_count), 2) if sent_count else 0.0
            ),
            # Same value as nltk.FreqDist(text).N(): the total character count.
            "num_of_characters": len(text),
            "sentence_count": sent_count,
            "noun_count": tag_counts['NN'] + tag_counts['NNS'] + tag_counts['NNPS'] + tag_counts['NNP'],
            "verb_count": tag_counts['VB'] + tag_counts['VBG'] + tag_counts['VBP'] + tag_counts['VBN'] + tag_counts['VBZ'],
            "comma_count": text.count(','),
            "punctuation_count": sum(text.count(mark) for mark in ['.', '?', '!', ':', ';']),
            "adjective_count": tag_counts['JJ'] + tag_counts['JJR'],
            "adverb_count": tag_counts['RB'] + tag_counts['RBR'] + tag_counts['RBS'],
            "quotation_mark_count": text.count('"') + text.count("'"),
            # Counts every LanguageTool match for the essay.
            "spelling_mistakes": len(tool.check(text)),
        }

    def featureSet2(X):
        # Extract the features of every essay in X and return them as a frame.

        # Loop-invariant: build these once instead of once per essay.
        stop_words = set(stopwords.words('english'))
        stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']) # remove it if you need punctuation
        tool = language_check.LanguageTool('en-US')

        records = []
        for position, (index, essay) in enumerate(X['essay'].items()):
            records.append(extract_features(essay, stop_words, tool))
            if position % 10 == 0:
                print("made features for rows with index upto ", index)

        # Assemble the frame in one go: writing to the rows yielded by
        # iterrows() (as the previous version did) only changes copies.
        return pd.DataFrame(records, index=X.index, columns=feature_names)

    def GenerateFeatures(X):
        # Time the full feature-extraction pass and report it in minutes.
        start = time()
        features = featureSet2(X)
        end = time()
        print("Generated the features for the entire data-set in {:.4f} minutes".format((end - start)/60.0))
        return features

    #Generate the feature set.
    print(len(cleaned))
    features = GenerateFeatures(cleaned)
    print("Done")

    X_all = pd.concat([cleaned[['essay']], features], axis=1)
    y_all = cleaned[target_col].rename('target')

    # Create the output folder first so a long extraction run is not lost to
    # a missing directory.
    os.makedirs('model_and_visualization', exist_ok=True)
    # to_csv has no `drop` argument; the default index=True writes the index
    # column that model_building later drops as 'Unnamed: 0'.
    X_all.to_csv('model_and_visualization/features_set_1.csv')
    y_all.to_csv('model_and_visualization/target_set_1.csv', header=True)

    return X_all, y_all
    

In [None]:
essay_data = pd.read_csv("domain123.csv")
# Bind the returned (features, targets) pair: as a bare last expression the
# result would be discarded and the notebook would render both frames in full.
X_all, y_all = data_clean(essay_data)

In [43]:
def model_building():
    """Fit a default LGBMRegressor on the saved feature/target CSVs and persist it.

    Reads ``model_and_visualization/features_set_1.csv`` and
    ``model_and_visualization/target_set_1.csv``, discards the saved index
    column (``Unnamed: 0``) from both and the raw ``essay`` text from the
    features, trains the regressor, and writes it to ``LGBM_Model.pkl`` in the
    working directory. Returns nothing.
    """
    import joblib
    import pandas as pd
    from lightgbm import LGBMRegressor

    features = pd.read_csv("model_and_visualization/features_set_1.csv").drop('Unnamed: 0', axis=1)
    targets = pd.read_csv("model_and_visualization/target_set_1.csv").drop('Unnamed: 0', axis=1)

    # The essay text itself is not a model input; only the numeric features are.
    numeric_features = features.drop(["essay"], axis=1)

    regressor = LGBMRegressor()
    regressor.fit(numeric_features, targets)

    # Save the model as a pickle in a file
    joblib.dump(regressor, 'LGBM_Model.pkl')


In [65]:
# Train on the saved feature/target CSVs; writes LGBM_Model.pkl to the working directory.
model_building()

In [34]:
# Essay to grade, typed or pasted at the interactive prompt.
text = input()

Free example essay on Information Studies: “Developments in Modern technology, means more information, more cheaply and more quickly available than ever before. What are the significance of these changes?”  Indeed, it can be said that “modern technology” has allowed for the cheaper and quicker access of information, and indeed, for the increase of information available. But is it fair to say that technology is a consequence of societal changes. Human beings are most dependant on learning than any species . Information is the basis for communication and co-ordination, that is required for any human society. Due to these developments in “modern technology”, there have been social, economic, political and cultural changes. Although it isn’t clear whether these are positive or negative changes.  There has been a definite increase in the amount of information available, due to the developments in modern technology. For one, the government has little or no control over the information which 

In [39]:

def predict_grade(text, custom_marks, max_score=12):
    """Score one essay with the saved LightGBM model and rescale it to ``custom_marks``.

    The 16 hand-crafted features are computed from ``text`` in the same order
    as the training feature columns, fed to the model loaded from
    ``LGBM_Model.pkl``, and the raw prediction is rescaled as
    ``prediction / max_score * custom_marks``. Both the raw and the rescaled
    prediction are printed.

    Parameters
    ----------
    text : str
        The essay to grade.
    custom_marks : int or float
        Maximum marks of the scale the caller wants the grade expressed in.
    max_score : int or float, optional
        Value the raw prediction is divided by; 12 is the constant that was
        previously hard-coded.
        NOTE(review): presumably the top score of the training scale -- confirm.

    Returns
    -------
    numpy.ndarray
        One-element array holding the rescaled score.
    """
    from collections import Counter

    import joblib
    import language_check
    import nltk
    import numpy as np
    from nltk.corpus import stopwords
    from nltk.corpus import wordnet as wn

    # Drop tokens starting with '@' (proper-noun tags in the data set) so they
    # do not produce false positives during POS tagging.
    text = " ".join(token for token in text.split() if token[0] != '@')

    comma_count = text.count(',')
    punctuation_count = sum(text.count(mark) for mark in ['.', '?', '!', ':', ';'])
    quotation_mark_count = text.count('"') + text.count("'")

    sent_count = len(nltk.sent_tokenize(text))

    # Word count after removing the stop words.
    words = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']) # remove it if you need punctuation

    # NOTE(review): removing from the list while iterating over it skips the
    # token after every removal, so some stop words survive. Left unchanged on
    # purpose: the persisted model was trained on features computed with this
    # exact loop (see data_clean). Fix both places together and retrain.
    for word in words:
        if word in stop_words:
            words.remove(word)
    word_count = len(words)

    long_word_count = sum(1 for word in words if len(word) > 6)
    total_word_length = sum(len(word) for word in words)

    # Guard the ratios: empty or stop-word-only input used to raise
    # ZeroDivisionError here.
    if word_count:
        avg_word_length_per_essay = round(total_word_length / float(word_count), 2)
    else:
        avg_word_length_per_essay = 0.0
    if sent_count:
        word_to_sent_ratio = round(word_count / float(sent_count), 2)
    else:
        word_to_sent_ratio = 0.0

    # Counts every LanguageTool match for the essay.
    tool = language_check.LanguageTool('en-US')
    spelling_mistakes = len(tool.check(text))

    # POS tags
    tag_counts = Counter(tag for _, tag in nltk.pos_tag(words))
    noun_count = tag_counts['NN'] + tag_counts['NNS'] + tag_counts['NNPS'] + tag_counts['NNP']
    verb_count = tag_counts['VB'] + tag_counts['VBG'] + tag_counts['VBP'] + tag_counts['VBN'] + tag_counts['VBZ']
    adjective_count = tag_counts['JJ'] + tag_counts['JJR']
    adverb_count = tag_counts['RB'] + tag_counts['RBR'] + tag_counts['RBS']

    # Words WordNet knows count as "domain words"; the rest as "wrong words".
    domain_word_count = sum(1 for word in words if wn.synsets(word))
    wrong_word_count = word_count - domain_word_count

    # Same value as nltk.FreqDist(text).N(): the total character count.
    num_of_characters = len(text)

    # Order must match the feature columns the model was trained on.
    to_predict = [word_count, long_word_count, avg_word_length_per_essay,
                  wrong_word_count, domain_word_count, word_to_sent_ratio,
                  num_of_characters, sent_count, noun_count, verb_count,
                  comma_count, punctuation_count, adjective_count,
                  adverb_count, quotation_mark_count, spelling_mistakes]

    # Load the model from the file. joblib files are pickles: only load model
    # files that were produced locally / come from a trusted source.
    lgb_from_joblib = joblib.load('LGBM_Model.pkl')

    # The model expects a 2-D array: one row, one column per feature.
    pred = lgb_from_joblib.predict(np.array(to_predict).reshape(1, -1))

    pred_score = (pred / max_score) * custom_marks
    print(pred)
    print(pred_score)

    return pred_score

In [42]:
# Call the grader by the name it is defined under (predict_grade); grade out of 8 marks.
predict_grade(text,8)

[7.72267251]
[5.14844834]


array([5.14844834])