In [None]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.14.0 textstat-0.7.3


In [None]:
import joblib
import numpy as np
import pandas as pd
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from collections import defaultdict
from collections import Counter
from sklearn.preprocessing import StandardScaler
from textstat import flesch_reading_ease, flesch_kincaid_grade, automated_readability_index, gunning_fog, coleman_liau_index, linsear_write_formula, dale_chall_readability_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer

# NLTK data used by the tokenizer (word_tokenize -> 'punkt') and by the
# part-of-speech tagger (pos_tag -> 'averaged_perceptron_tagger').
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Pre-trained models used by the scoring functions below. The paths are
# relative, so the files must be in the current working directory.
# NOTE(security): joblib.load unpickles its input, which can execute arbitrary
# code -- only load model files that come from a trusted source.
model1 = joblib.load('linguistic_regressor.pkl')  # used by linguistic_score
model2 = joblib.load('readability_regressor_model.pkl')  # used by readability_score
model3 = joblib.load('TFIDF_model.joblib')  # used by tfidf_score

In [None]:
def scale_value(value, old_min, old_max, new_min, new_max):
    """Linearly map ``value`` from [old_min, old_max] onto [new_min, new_max].

    ``old_min`` maps to ``new_min`` and ``old_max`` maps to ``new_max``.
    Values outside the old range are extrapolated, not clamped.
    ``old_min`` and ``old_max`` must differ (the old width is a divisor).
    """
    offset = value - old_min
    stretched = offset * (new_max - new_min)
    return stretched / (old_max - old_min) + new_min

In [None]:
def count_pos_tags(dataframe, question_column):
    """Add one integer column per part-of-speech tag found in ``question_column``.

    Every text in ``dataframe[question_column]`` is tokenized with
    ``word_tokenize`` and tagged with ``pos_tag``. For each distinct tag seen
    anywhere in the column, a column named after the tag is added; it holds
    the number of tokens in that row that carry the tag.

    Args:
        dataframe: pandas DataFrame holding the texts. It is modified in place.
        question_column: name of the column that contains the texts.

    Returns:
        The same ``dataframe`` object, with the tag-count columns appended in
        sorted tag order.
    """
    # Tokenize and tag each text exactly once; the per-row Counter is reused
    # for every tag column instead of re-running the tagger once per tag.
    tag_counts_per_row = [
        Counter(tag for _, tag in pos_tag(word_tokenize(text)))
        for text in dataframe[question_column]
    ]

    # Sorted so that the column order is reproducible between runs.
    unique_pos_tags = sorted(set().union(*tag_counts_per_row))

    for tag_name in unique_pos_tags:
        # A Counter returns 0 for tags that do not occur in a row.
        dataframe[tag_name] = [counts[tag_name] for counts in tag_counts_per_row]

    return dataframe

In [None]:
def linguistic_score(text):
  """Score ``text`` with the linguistic regressor (``model1``).

  Features: one count per part-of-speech tag (from ``count_pos_tags``) plus
  ``number_of_words``, ``number_of_sentences`` and ``avg_word_length``. The
  feature columns and their order come from the header of ``tags_order.csv``
  (first column skipped); features missing for this text are set to 0.

  Args:
      text: the question to score; coerced to ``str``.

  Returns:
      The model prediction rescaled with ``scale_value(pred, 1, 170, 1, 10)``.
  """
  text = str(text)
  words = text.split()

  number_of_words = len(words)
  # Counts "."-separated pieces, so a trailing period adds one. Left unchanged
  # on purpose: presumably model1 was trained on this exact feature -- TODO confirm.
  number_of_sentences = len(text.split("."))
  # Blank input has no words; use 0.0 rather than dividing by zero.
  avg_word_length = sum(len(word) for word in words) / number_of_words if words else 0.0

  tags = count_pos_tags(pd.DataFrame({"question": [text]}), "question")
  tags["number_of_words"] = number_of_words
  tags["number_of_sentences"] = number_of_sentences
  tags["avg_word_length"] = avg_word_length

  # Only the header is needed. The first column is skipped exactly as before
  # (presumably a saved index column -- TODO confirm).
  feature_columns = pd.read_csv("tags_order.csv", nrows=0).columns[1:]
  # Keep the expected columns in the expected order; absent ones become 0,
  # and columns not listed in the file are dropped.
  features = tags.reindex(columns=feature_columns, fill_value=0)

  y_pred = model1.predict(features)

  # 1..170 is taken as the raw output range of model1 (magic numbers from the
  # original code -- TODO confirm); predictions outside it map outside 1..10.
  return scale_value(y_pred[0], 1, 170, 1, 10)

In [None]:
def readability_score(question):
  """Score ``question`` with the readability regressor (``model2``).

  Seven textstat readability metrics are computed for the question and passed,
  in the fixed order below, as a single-row feature matrix to ``model2``. The
  prediction is rescaled with ``scale_value(pred, 5, 30, 1, 10)``.
  """
  # The order is part of the model's input contract -- do not reorder.
  metrics = (
      flesch_reading_ease,
      flesch_kincaid_grade,
      automated_readability_index,
      gunning_fog,
      coleman_liau_index,
      linsear_write_formula,
      dale_chall_readability_score,
  )
  feature_row = [metric(question) for metric in metrics]

  raw_prediction = model2.predict([feature_row])[0]
  return scale_value(raw_prediction, 5, 30, 1, 10)

In [None]:
def tfidf_score(question):
  """Predict the difficulty of ``question`` with ``model3``.

  The question is transformed by the vectorizer stored in
  ``TFIDF_QA_vectorizer.joblib`` and the result is passed to ``model3``.
  The vectorizer is loaded from disk on the first call only and then reused.

  Returns:
      The first element of ``model3.predict`` for the question. Unlike the
      other scorers, this value is returned without rescaling.
  """
  vectorizer = getattr(tfidf_score, "_vectorizer", None)
  if vectorizer is None:
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code -- only load artifacts from a trusted source.
    vectorizer = joblib.load('TFIDF_QA_vectorizer.joblib')
    # Cached on the function so later calls skip the disk read.
    tfidf_score._vectorizer = vectorizer

  X = vectorizer.transform([question])
  return model3.predict(X)[0]

In [None]:
def final_prediction(question):
  """Return the overall difficulty of ``question``, rounded to 3 decimals.

  The result is the unweighted mean of ``linguistic_score``,
  ``readability_score`` and ``tfidf_score``, evaluated in that order.
  """
  scorers = (linguistic_score, readability_score, tfidf_score)
  component_scores = [scorer(question) for scorer in scorers]

  averaged = np.mean(component_scores, axis=0)
  return np.round(averaged, 3)

In [None]:
# Score a fixed sample question and print the result.
question = "what is happening to the environment that is causing gloabal warming"

difficulty = final_prediction(question)
print(f"Difficulty: {difficulty!s}/10")

Difficulty: 5.375/10




In [None]:
import ipywidgets as widgets
from IPython.display import display

# Free-text box for the question to score; a later cell reads its current
# contents from `text_input.value`.
text_input = widgets.Text(
    value='',
    placeholder='Enter text here',
    description='Input your question here:',
    disabled=False,
    # Long descriptions are cut off at the default label width; 'initial'
    # lets the label take the space it needs.
    style={'description_width': 'initial'},
)
display(text_input)

Text(value='', description='Input your question here:', placeholder='Enter text here')

In [None]:
# Capture whatever is typed in the text box at the moment this cell runs;
# re-run the cell after editing the box to pick up the new text.
user_question = text_input.value
print("Your question:", user_question)

Your question: what is happening to the environment that is causing gloabal warming


In [None]:
# The text box is empty on a fresh run (e.g. Restart & Run All), and a blank
# question has no words to score, so only predict when something was entered.
if user_question.strip():
    difficulty = final_prediction(user_question)
    print("Difficulty: " + str(difficulty) + "/10")
else:
    print("No question entered: type one in the text box, then re-run the previous cell and this one.")

Difficulty: 5.375/10


