In [None]:
import numpy as np                                                              #Used for numerical computations 
import pandas as pd                                                             #Used for reading the data
import matplotlib.pyplot as plt                                                 #Used for plotting 
from nltk.corpus import stopwords                                               #This is used to plot the number of stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize                          #This is used to divide the overall text data to tokens and sentences
import tqdm                                                                     #Used for measuring the time it takes to get the things done 
import re                                                                       #Standard library for reading and substituting the word expressions 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm
import nltk                                                                     #Used for the natural language processing tasks 
from wordcloud import WordCloud                                                 #It is used to plot the frequency of the words which determines their size

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
##Reading the training data, testing data and sample values that we are going to be understanding and using in the long term. 

df_train = pd.read_csv('/content/drive/MyDrive/mp2_dataset/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/mp2_dataset/test.csv')

In [None]:
import nltk
nltk.download('stopwords')


'''
WordNet is a lexical database for the English language, which was created by Princeton, and is part of the NLTK corpus.
You can use WordNet alongside the NLTK module to find the meanings of words, synonyms, antonyms, and more.
'''
nltk.download('wordnet')


'''
Punkt Sentence Tokenizer

This tokenizer divides a text into a list of sentences by using an unsupervised algorithm to build a model for abbreviation words, collocations, and words that start sentences.
It must be trained on a large collection of plaintext in the target language before it can be used.

The NLTK data package includes a pre-trained Punkt tokenizer for English.
'''
nltk.download('punkt')



nltk.download('omw-1.4')        #Open Multilingual Wordnet

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
def preprocessing_function(df):
    """
    This function takes into consideration the dataframe and extracts the text.
    In addition, it makes modifications to the text and converts it to a simpler form
    for machine learning processing respectively."""
    
    text_list = []
    for text in tqdm(df['excerpt'].values):
        text = re.sub('[^a-zA-Z]', ' ', text)
        text = text.lower()
        text = nltk.word_tokenize(text)
        [word for word in text if not word in set(stopwords.words("english"))]
        lemmatizer = nltk.WordNetLemmatizer()
        text = [lemmatizer.lemmatize(word) for word in text]
        text = " ".join(text)
        text_list.append(text)
    text_list = pd.Series(text_list)
    text_list.column = ['Converted_text']
    return text_list

In [None]:
preprocessed_text = preprocessing_function(df_train)
preprocessed_text_test = preprocessing_function(df_test)

100%|██████████| 2834/2834 [01:07<00:00, 41.90it/s]
100%|██████████| 7/7 [00:00<00:00, 47.80it/s]


In [None]:
print(preprocessed_text)

0       when the young people returned to the ballroom...
1       all through dinner time mr fayre wa somewhat s...
2       a roger had predicted the snow departed a quic...
3       and outside before the palace a great garden w...
4       once upon a time there were three bear who liv...
                              ...                        
2829    when you think of dinosaur and where they live...
2830    so what is a solid solid are usually hard beca...
2831    the second state of matter we will discus is a...
2832    solid are shape that you can actually touch th...
2833    animal are made of many cell they eat thing an...
Length: 2834, dtype: object


**Defining get_useful_features functions**


It is now time to get the useful features that are important for machine learning. We would have to be creating new features that would help the machine learning models to get the best predictions for the difficulty of the text.

Taking into consideration the excerpt and stopwords, we are going to be creating new feautres such as total number of words, sentence length, overall change in the text length and other features that are important for getting the machine learning outputs. The function would return the final dataframe that contains all the preprocessed output along with the newly created features that are important for machine learning.

In [None]:
def get_useful_features(df, stop_words):
    """
    The function would take the dataframe and stopwords and then, convert the excerpts into different features
    such as the number of sentences, words and the lenght of the lemmas created along with the overall preprocessed
    essay length."""
    sentences = []
    num_of_words = []
    sent_length = []
    word_length = []
    lemma_length = []
    num_of_lemmas = []
    preprocessed_essay_length = []
    initial_text_length = []
    num_of_sentences = []
    text_shortage = []
    
    for text in tqdm(df['excerpt'].values):
        
        initial_length = len(text)
        initial_text_length.append(initial_length)
        num_sentences = len(sent_tokenize(text))
        num_of_sentences.append(num_sentences)
        text = re.sub('[^a-zA-Z]', ' ', text)
        text = text.lower()
        text = word_tokenize(text)
        num_words = len(text) 
        num_of_words.append(num_words)
        sent_length.append(num_words/num_sentences)
        word_length.append(initial_length/num_words)
        text = [word for word in text if not word in stop_words]
        lemmatizer = nltk.WordNetLemmatizer()
        text = [lemmatizer.lemmatize(word) for word in text]
        #print(text)
        num_lemmas = len(text)
        num_of_lemmas.append(num_lemmas)
        text = " ".join(text)
        #print(text)
        preprocessed_essay_length_value = len(text)
        preprocessed_essay_length.append(preprocessed_essay_length_value)
        #print(preprocessed_essay_length)
        #print(num_lemmas)
        lemma_length.append(preprocessed_essay_length_value/num_lemmas)
        
        text_shortage.append(preprocessed_essay_length_value/initial_length)
        
    final_df = pd.concat([pd.Series(sent_length), pd.Series(num_of_words),
                             pd.Series(word_length), pd.Series(lemma_length),
                             pd.Series(num_of_sentences), pd.Series(initial_text_length),
                             pd.Series(num_of_lemmas), pd.Series(preprocessed_essay_length),
                             pd.Series(text_shortage)], axis = 1)
    final_df.columns = ["sentence_length", "num_of_words", "word_length",
                           "lemma_length", "num_of_sentences",
                           "initial_text_length", "num_of_lemmas",
                           "preprocessed_essay_length", "text_shortage"]
    
    return final_df
        

In [None]:
final_df = get_useful_features(df_train, stop_words = set(stopwords.words("english")))

100%|██████████| 2834/2834 [00:05<00:00, 560.27it/s]


In [None]:
final_df_test = get_useful_features(df_test, stop_words = set(stopwords.words("english")))

100%|██████████| 7/7 [00:00<00:00, 495.16it/s]


In [None]:
print(final_df.head())

   sentence_length  num_of_words  word_length  lemma_length  num_of_sentences  \
0        16.454545           181     5.480663      6.494505                11   
1        11.466667           172     5.447674      6.482353                15   
2        15.636364           172     5.279070      6.273810                11   
3        33.400000           167     5.443114      6.095745                 5   
4        30.200000           151     4.788079      5.581081                 5   

   initial_text_length  num_of_lemmas  preprocessed_essay_length  \
0                  992             91                        591   
1                  937             85                        551   
2                  908             84                        527   
3                  909             94                        573   
4                  723             74                        413   

   text_shortage  
0       0.595766  
1       0.588047  
2       0.580396  
3       0.630363  
4       0

## 3.7 Creating new function that generates more features

We are going to create a function that would create more features such as counting the number of commas, semicolons and other important features that are important for machine learning analysis. We would have to create empty lists of these values and we are going to concat those by converting them into series and then, return a new dataframe respectively. 

With the help of this function, we have created new dataframe which contains the useful columns that are important for machine learning respectively. We would be performing the feature analysis and thise ensures that we are going to be getting the best results on the test set respectively. 

At last, we are going to concat those values that are important for machine learning and this would ensure that we get the best results in the test set respectively. We are going to be taking those values and this ensures that we are getting the best results on the test set. 

In [None]:
def generate_more_features(df: pd.DataFrame):
    """
    This function would create a dataframe of different useful features
    that are important for machine learning predictions respectively.
    """
    commas = []
    semicolon = []
    exclamations = []
    questions = []
    quotes = []
    periods = []
    longest_word = []
    
    for i in range(len(df)):
        
        #word_len = []
        text = df['excerpt'].iloc[i]
        commas.append(text.count(","))
        semicolon.append(text.count(";"))
        exclamations.append(text.count("!"))
        questions.append(text.count("?"))
        quotes.append(text.count('"'))
        periods.append(text.count('.'))
        word_len = [len(w) for w in text.split(" ")]
        longest_word.append(np.max(word_len))
        
    df_with_features =pd.concat((pd.Series(commas), pd.Series(semicolon), pd.Series(exclamations),
                               pd.Series(questions), pd.Series(quotes), pd.Series(periods),
                                pd.Series(longest_word)), axis = 1)
    df_with_features.columns = ["num_of_commas", "num_of_semicolons", "num_of_explamations",
                                "num_of_questions", "num_of_quotes", "num_of_periods", 
                                "longest_word"]
                                
    return df_with_features

In [None]:
df_with_more_features = generate_more_features(df_train)

In [None]:
df_with_more_features_test = generate_more_features(df_test)

In [None]:
print(df_with_more_features.head())

   num_of_commas  num_of_semicolons  num_of_explamations  num_of_questions  \
0             14                  0                    0                 0   
1             24                  0                    5                 2   
2             17                  2                    1                 0   
3             23                  2                    0                 0   
4             13                 10                    0                 0   

   num_of_quotes  num_of_periods  longest_word  
0              0              11            14  
1             12              10            15  
2             10              11            14  
3              0               5            13  
4              0               5            12  


In [None]:
df_complete = pd.concat((preprocessed_text, df_with_more_features, final_df), axis = 1)

In [None]:
df_complete_test = pd.concat((preprocessed_text_test, df_with_more_features_test, final_df_test), axis = 1)

In [None]:
print(df_complete)

                                                      0  num_of_commas  \
0     when the young people returned to the ballroom...             14   
1     all through dinner time mr fayre wa somewhat s...             24   
2     a roger had predicted the snow departed a quic...             17   
3     and outside before the palace a great garden w...             23   
4     once upon a time there were three bear who liv...             13   
...                                                 ...            ...   
2829  when you think of dinosaur and where they live...             12   
2830  so what is a solid solid are usually hard beca...              5   
2831  the second state of matter we will discus is a...              2   
2832  solid are shape that you can actually touch th...              8   
2833  animal are made of many cell they eat thing an...             23   

      num_of_semicolons  num_of_explamations  num_of_questions  num_of_quotes  \
0                     0       

In [None]:
df_complete_important_features = df_complete.iloc[:, 1:]
df_complete_important_features_test = df_complete_test.iloc[:, 1:]

In [None]:
print(df_complete_important_features)

      num_of_commas  num_of_semicolons  num_of_explamations  num_of_questions  \
0                14                  0                    0                 0   
1                24                  0                    5                 2   
2                17                  2                    1                 0   
3                23                  2                    0                 0   
4                13                 10                    0                 0   
...             ...                ...                  ...               ...   
2829             12                  0                    0                 3   
2830              5                  0                    0                 1   
2831              2                  1                    0                 1   
2832              8                  1                    0                 0   
2833             23                  1                    0                 0   

      num_of_quotes  num_of

**Feature Scaling**

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
def min_max_scaler(df_train, df_test):
    """
    This function performs the scaling operation by taking into account the train and test set respectively. It is going to 
    convert the values that are present in the data to lie between 0 and 1 respectively.
    """
    scaler = MinMaxScaler()
    scaler.fit(df_train)
    df_scaled = scaler.transform(df_train)
    df_scaled_test = scaler.transform(df_test)
    return df_scaled, df_scaled_test

In [None]:
df_scaled, df_scaled_test = min_max_scaler(df_complete_important_features, df_complete_important_features_test)

In [None]:
#Splitting the dataset into training and testing sets
x = df_scaled
y = df_train['target'].values
x_train, x_cv, y_train, y_cv = train_test_split(x, y, test_size = 0.3, random_state = 50)

**Training our dataset over various models**

In [None]:
#SGDRegreesor
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
sgd_reg = SGDRegressor(random_state=42)
sgd_reg.fit(x_train,y_train)

SGDRegressor(random_state=42)

In [None]:
#MSE of SGDRegreesor
y_pred = sgd_reg.predict(x_cv)
print(mean_squared_error(y_cv,y_pred))

0.7153011475355732


In [None]:
#RandomForestRegressor and it's MSE
from sklearn.ensemble import RandomForestRegressor
rnd_reg = RandomForestRegressor(n_estimators=100,random_state=42)
rnd_reg.fit(x_train,y_train)
y_pred = sgd_reg.predict(x_cv)
print(mean_squared_error(y_cv,y_pred))

0.7153011475355732


In [None]:
#GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor()
model.fit(x_train, y_train)


GradientBoostingRegressor()

In [None]:
#Prediction
print("Actual value",y_cv[9])
print("Predicted value",model.predict([x_cv[9,:]]))

Actual value -0.06499875
Predicted value [-0.75157599]


In [None]:
#MSE of GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
y_pred = model.predict(x_cv)
print(mean_squared_error(y_cv,y_pred))

0.6415022648606243


In [None]:
!pip install flask
!pip install pyngrok==4.1.1
!pip install flask_ngrok

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyngrok==4.1.1
  Downloading pyngrok-4.1.1.tar.gz (18 kB)
Building wheels for collected packages: pyngrok
  Building wheel for pyngrok (setup.py) ... [?25l[?25hdone
  Created wheel for pyngrok: filename=pyngrok-4.1.1-py3-none-any.whl size=15982 sha256=1ef2114f25214aeaeb74482d2b137f1151c6b3bfaa9770d70a1badebab45e1c0
  Stored in directory: /root/.cache/pip/wheels/b1/d9/12/045a042fee3127dc40ba6f5df2798aa2df38c414bf533ca765
Successfully built pyngrok
Installing collected packages: pyngrok
Successfully installed pyngrok-4.1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flask_ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0

In [None]:
from flask import Flask,request,render_template
from flask_ngrok import run_with_ngrok



In [None]:
!ngrok authtoken 2HFX6zbW3w5q5cZv6703lG5u1HL_3fVS7PL6xxme3ECU4L2qa
app = Flask(__name__,template_folder='/content/drive/MyDrive/mp2_dataset/')
run_with_ngrok(app)

@app.route('/')
def home():
  return render_template("mp.html")
@app.route('/predict',methods=['POST','GET'])
def predict():
  if request.method == 'POST':
    paragraph = request.form['iptext1']
  df = {"excerpt":[paragraph]}
  df = pd.DataFrame(df)
  def solution(df):
    preprocessed_text = preprocessing_function(df)
    final_df = get_useful_features(df, stop_words = set(stopwords.words("english")))
    df_with_more_features = generate_more_features(df)
    df_complete = pd.concat((preprocessed_text, df_with_more_features, final_df), axis = 1)
    df_complete_important_features = df_complete.iloc[:, 1:]
    scaler = MinMaxScaler()
    scaler.fit(df_complete_important_features)
    df_scaled = scaler.transform(df_complete_important_features)
    return model.predict(df_scaled)
  result = solution(df)
  if result >= 0.3:
    res = "EASY"
  elif result >= -1 and result < 1: 
     res = "EASY"
  else:
     res = "HARD"
  return render_template('mp.html',result=res)
  

app.run()

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml
 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


INFO:werkzeug: * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://aff3-34-125-193-93.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


INFO:werkzeug:127.0.0.1 - - [24/Nov/2022 01:59:11] "[37mGET / HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Nov/2022 01:59:11] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
100%|██████████| 1/1 [00:00<00:00, 245.83it/s]
100%|██████████| 1/1 [00:00<00:00, 2686.93it/s]
INFO:werkzeug:127.0.0.1 - - [24/Nov/2022 02:00:21] "[37mPOST /predict HTTP/1.1[0m" 200 -


In [None]:
#Test

paragraph = input()
data = {"excerpt":[paragraph]}
df = pd.DataFrame(data)



The sun is a huge ball of gases. It has a diameter of 1,392,000 km. It is so huge that it can hold millions of planets inside it.


In [None]:
 def solution(df):
    preprocessed_text = preprocessing_function(df)
    print( preprocessed_text)
    final_df = get_useful_features(df, stop_words = set(stopwords.words("english")))
    df_with_more_features = generate_more_features(df)
    df_complete = pd.concat((preprocessed_text, df_with_more_features, final_df), axis = 1)
    df_complete_important_features = df_complete.iloc[:, 1:]
    scaler = MinMaxScaler()
    scaler.fit(df_complete_important_features)
    df_scaled = scaler.transform(df_complete_important_features)
    return model.predict(df_scaled)

In [None]:
result = solution(df)
if result >= 1:
  res = "EASY"
elif result >= -1 and result < 1: 
    res = "MODERATE"
else:
    res = "HARD"
print(res)
print(result)

100%|██████████| 1/1 [00:00<00:00, 113.72it/s]


0    the sun is a huge ball of gas it ha a diameter...
dtype: object


100%|██████████| 1/1 [00:00<00:00, 1004.62it/s]

MODERATE
[0.0801772]



