In [82]:
import numpy as np
import pandas as pd

In [83]:
CLEAN_URL="/cleaned_dataset.csv"

In [84]:
df=pd.read_csv(CLEAN_URL, index_col=0)

In [85]:
def get_question_frequency(dataframe):
  # Get the frequency of all a qid in both 'qid1' and 'qid2' columns
  # Code reference: https://stackoverflow.com/questions/65628578/counting-total-number-of-occurrences-in-selected-multiple-columns-in-pandas
  id_cols = ['qid1', 'qid2'] 
  df_qid_freq = dataframe[id_cols].melt(value_name='all_qid').groupby('all_qid').size().reset_index(name='freq')
  dict_qid_freq = dict(zip(df_qid_freq['all_qid'], df_qid_freq['freq']))

  # freq_qid1 = Frequency of qid1’s (number of times question1 occur in the dataset)
  # freq_qid2 = Frequency of qid2’s (number of times question2 occur in the dataset)
  df['freq_qid1'] = 0
  df['freq_qid2'] = 0
  for index, row in dataframe.iterrows():
    if (index % 50000 == 0):
      print(f'{index}th iteration')
    current_qid1 = row['qid1']
    current_qid2 = row['qid2']
    # Get the frequency if qid matches
    dataframe.at[index, 'freq_qid1'] = dict_qid_freq[current_qid1]
    dataframe.at[index, 'freq_qid2'] = dict_qid_freq[current_qid2]

In [86]:
get_question_frequency(df)

0th iteration
50000th iteration
100000th iteration
150000th iteration
200000th iteration
250000th iteration
300000th iteration
350000th iteration


In [87]:
df[['qid1', 'qid2', 'question1', 'question2', 'freq_qid1', 'freq_qid2']].head(5)

Unnamed: 0,qid1,qid2,question1,question2,freq_qid1,freq_qid2
0,213221,213222,how is the life of a math student? could you d...,which level of prepration is enough for the ex...,1,1
1,536040,536041,how do i control my horny emotions?,how do you control your horniness?,1,1
2,364011,490273,what causes stool color to change to yellow?,what can cause stool to come out as little balls?,2,1
3,155721,7256,what can one do after mbbs?,what do i do after my mbbs ?,3,3
4,279958,279959,where can i find a power outlet for my laptop ...,"would a second airport in sydney, australia be...",1,1


In [88]:
def get_question_length(dataframe):
  # q1len = Length of q1
  # q2len = Length of q2
  dataframe['q1len'] = dataframe['question1'].str.len()
  dataframe['q2len'] = dataframe['question2'].str.len()

In [89]:
get_question_length(df)

In [90]:
df[['question1', 'question2', 'q1len', 'q2len']].head(5)

Unnamed: 0,question1,question2,q1len,q2len
0,how is the life of a math student? could you d...,which level of prepration is enough for the ex...,75,55
1,how do i control my horny emotions?,how do you control your horniness?,35,34
2,what causes stool color to change to yellow?,what can cause stool to come out as little balls?,44,49
3,what can one do after mbbs?,what do i do after my mbbs ?,27,28
4,where can i find a power outlet for my laptop ...,"would a second airport in sydney, australia be...",67,121


In [91]:
def get_question_word_count(dataframe):
  # q1_words = Number of words in Question 1
  # q2_words = Number of words in Question 2
  dataframe['q1_words'] = dataframe['question1'].str.split().str.len()
  dataframe['q2_words'] = dataframe['question2'].str.split().str.len()

In [92]:
get_question_word_count(df)

In [93]:
df[['question1', 'question2', 'q1_words', 'q2_words']].head(5)

Unnamed: 0,question1,question2,q1_words,q2_words
0,how is the life of a math student? could you d...,which level of prepration is enough for the ex...,14,10
1,how do i control my horny emotions?,how do you control your horniness?,7,6
2,what causes stool color to change to yellow?,what can cause stool to come out as little balls?,8,10
3,what can one do after mbbs?,what do i do after my mbbs ?,6,8
4,where can i find a power outlet for my laptop ...,"would a second airport in sydney, australia be...",13,20


In [94]:
def get_question_characters_count(dataframe):
  # q1_characters = Number of distinct characters in Question 1
  # q2_characters = Number of distinct characters in Question 2
  dataframe['q1_characters'] = dataframe['question1'].str.lower().apply(set).map(len)
  dataframe['q2_characters'] = dataframe['question2'].str.lower().apply(set).map(len)

In [95]:
get_question_characters_count(df)

In [96]:
df[['question1', 'question2', 'q1_characters', 'q2_characters']].head(5)

Unnamed: 0,question1,question2,q1_characters,q2_characters
0,how is the life of a math student? could you d...,which level of prepration is enough for the ex...,22,23
1,how do i control my horny emotions?,how do you control your horniness?,16,16
2,what causes stool color to change to yellow?,what can cause stool to come out as little balls?,16,16
3,what can one do after mbbs?,what do i do after my mbbs ?,16,16
4,where can i find a power outlet for my laptop ...,"would a second airport in sydney, australia be...",20,25


In [97]:
def get_question_unique_word_count(dataframe):
  # for testing only
  # dataframe['test_u_words'] = dataframe['test_words'].str.lower().str.split().apply(set).map(len)

  # q1_u_words = Number of unique words in Question 1
  # q2_u_words = Number of unique words in Question 2
  dataframe['q1_u_words'] = dataframe['question1'].str.lower().str.split().apply(set).map(len)
  dataframe['q2_u_words'] = dataframe['question2'].str.lower().str.split().apply(set).map(len)

In [98]:
# for testing only
# test_d = {'test_words': ['my mY', 'Ara ara', 'yaRe Yare daze', 'haha lmao']}
# test_df = pd.DataFrame(data=test_d)
# get_question_unique_word_count(test_df)
# test_df.head(4)

In [99]:
get_question_unique_word_count(df)

In [100]:
df[['question1', 'question2', 'q1_u_words', 'q2_u_words']].head(6)

Unnamed: 0,question1,question2,q1_u_words,q2_u_words
0,how is the life of a math student? could you d...,which level of prepration is enough for the ex...,14,10
1,how do i control my horny emotions?,how do you control your horniness?,7,6
2,what causes stool color to change to yellow?,what can cause stool to come out as little balls?,7,10
3,what can one do after mbbs?,what do i do after my mbbs ?,6,7
4,where can i find a power outlet for my laptop ...,"would a second airport in sydney, australia be...",13,19
5,how not to feel guilty since i am muslim and i...,"i don't beleive i am bulimic, but i force thro...",17,25


In [101]:
from spacy.lang.en import English

def get_question_unique_word_count_wo_stop_word(dataframe):
  nlp1 = English()
  # Get unique words from question in the form of set
  # q1_u_words_w_sw = Number of unique words without stop word in Question 1
  # q2_u_words _w_sw = Number of unique words without stop word in Question 2
  # total_u_words_wo_sw = Total unique word without stop word
  df['q1_u_words_wo_sw'] = dataframe['question1'].str.lower().str.split().apply(set)
  df['q2_u_words_wo_sw'] = dataframe['question2'].str.lower().str.split().apply(set)
  df['total_u_words_wo_sw'] = 0
  for index, row in dataframe.iterrows():
    if (index % 50000 == 0):
      print(f'{index}th iteration')
    # Join the words in unique word set with whitespace to form a string for processing later
    question1_u_words = " ".join(row['q1_u_words_wo_sw'])
    question2_u_words = " ".join(row['q2_u_words_wo_sw'])
    dataframe.at[index,'q1_u_words_wo_sw'] = len([token for token in nlp1(question1_u_words) if not token.is_stop])
    dataframe.at[index,'q2_u_words_wo_sw'] = len([token for token in nlp1(question2_u_words) if not token.is_stop])
    dataframe.at[index, 'total_u_words_wo_sw'] = dataframe.at[index,'q1_u_words_wo_sw'] + dataframe.at[index,'q2_u_words_wo_sw']

In [102]:
get_question_unique_word_count_wo_stop_word(df)

0th iteration
50000th iteration
100000th iteration
150000th iteration
200000th iteration
250000th iteration
300000th iteration
350000th iteration


In [103]:
df[['question1', 'question2', 'q1_u_words_wo_sw', 'q2_u_words_wo_sw', 'total_u_words_wo_sw']].head(5)

Unnamed: 0,question1,question2,q1_u_words_wo_sw,q2_u_words_wo_sw,total_u_words_wo_sw
0,how is the life of a math student? could you d...,which level of prepration is enough for the ex...,7,5,12
1,how do i control my horny emotions?,how do you control your horniness?,4,3,7
2,what causes stool color to change to yellow?,what can cause stool to come out as little balls?,6,6,12
3,what can one do after mbbs?,what do i do after my mbbs ?,2,2,4
4,where can i find a power outlet for my laptop ...,"would a second airport in sydney, australia be...",7,15,22


In [104]:
def get_question_unique_common_word_count(dataframe):
  # Find the intersection between unique words set from question 1 with that of question 2, then get the len of the resulting set
  # word_common = (Number of common unique words in question 1 and question 2).
  dataframe['word_common'] = dataframe.apply(lambda row: len(set(row['question1'].lower().split()).intersection(set(row['question2'].lower().split()))), axis=1)

In [105]:
get_question_unique_common_word_count(df)

In [106]:
df[['question1', 'question2', 'q1_u_words', 'q2_u_words', 'word_common']].head(5)

Unnamed: 0,question1,question2,q1_u_words,q2_u_words,word_common
0,how is the life of a math student? could you d...,which level of prepration is enough for the ex...,14,10,3
1,how do i control my horny emotions?,how do you control your horniness?,7,6,3
2,what causes stool color to change to yellow?,what can cause stool to come out as little balls?,7,10,3
3,what can one do after mbbs?,what do i do after my mbbs ?,6,7,3
4,where can i find a power outlet for my laptop ...,"would a second airport in sydney, australia be...",13,19,2


In [107]:
def get_question_words_total(dataframe):
  # word_total =(Total num of words in question 1 + Total num of words in question 2)
  dataframe['word_total'] = dataframe['q1_words'] + dataframe['q2_words'] 

In [108]:
get_question_words_total(df)

In [109]:
df[['question1', 'question2', 'q1_words', 'q2_words', 'word_total']].head(5)

Unnamed: 0,question1,question2,q1_words,q2_words,word_total
0,how is the life of a math student? could you d...,which level of prepration is enough for the ex...,14,10,24
1,how do i control my horny emotions?,how do you control your horniness?,7,6,13
2,what causes stool color to change to yellow?,what can cause stool to come out as little balls?,8,10,18
3,what can one do after mbbs?,what do i do after my mbbs ?,6,8,14
4,where can i find a power outlet for my laptop ...,"would a second airport in sydney, australia be...",13,20,33


In [110]:
def get_question_word_share(dataframe):
  # word_share = (word_common)/(word_Total)
  dataframe['word_share'] = dataframe['word_common'] / dataframe['word_total']

In [111]:
get_question_word_share(df)

In [112]:
df[['question1', 'question2', 'word_common', 'word_total', 'word_share']].head(5)

Unnamed: 0,question1,question2,word_common,word_total,word_share
0,how is the life of a math student? could you d...,which level of prepration is enough for the ex...,3,24,0.125
1,how do i control my horny emotions?,how do you control your horniness?,3,13,0.230769
2,what causes stool color to change to yellow?,what can cause stool to come out as little balls?,3,18,0.166667
3,what can one do after mbbs?,what do i do after my mbbs ?,3,14,0.214286
4,where can i find a power outlet for my laptop ...,"would a second airport in sydney, australia be...",2,33,0.060606


In [113]:
def get_question_freq_sum(dataframe):
  # freq_sum = sum total of the frequency of qid1 and qid2 (freq_q1 + freq_q2)
  dataframe['freq_sum'] = dataframe['freq_qid1'] + dataframe['freq_qid2']

In [114]:
get_question_freq_sum(df)

In [115]:
df[['question1', 'question2', 'freq_qid1', 'freq_qid2', 'freq_sum']].head(5)

Unnamed: 0,question1,question2,freq_qid1,freq_qid2,freq_sum
0,how is the life of a math student? could you d...,which level of prepration is enough for the ex...,1,1,2
1,how do i control my horny emotions?,how do you control your horniness?,1,1,2
2,what causes stool color to change to yellow?,what can cause stool to come out as little balls?,2,1,3
3,what can one do after mbbs?,what do i do after my mbbs ?,3,3,6
4,where can i find a power outlet for my laptop ...,"would a second airport in sydney, australia be...",1,1,2


In [116]:
def get_question_freq_diff(dataframe):
  # freq_diff = absolute difference of frequency of qid1 and qid2 (abs(freq_q1-freq_q2))
  dataframe['freq_diff'] = abs(dataframe['freq_qid1'] - dataframe['freq_qid2'])

In [117]:
get_question_freq_diff(df)

In [118]:
df[['question1', 'question2', 'freq_qid1', 'freq_qid2', 'freq_diff']].head(5)

Unnamed: 0,question1,question2,freq_qid1,freq_qid2,freq_diff
0,how is the life of a math student? could you d...,which level of prepration is enough for the ex...,1,1,0
1,how do i control my horny emotions?,how do you control your horniness?,1,1,0
2,what causes stool color to change to yellow?,what can cause stool to come out as little balls?,2,1,1
3,what can one do after mbbs?,what do i do after my mbbs ?,3,3,0
4,where can i find a power outlet for my laptop ...,"would a second airport in sydney, australia be...",1,1,0


In [124]:
df = df.drop(labels=['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'], axis=1)

In [126]:
df.head(5)

Unnamed: 0,freq_qid1,freq_qid2,q1len,q2len,q1_words,q2_words,q1_characters,q2_characters,q1_u_words,q2_u_words,q1_u_words_wo_sw,q2_u_words_wo_sw,total_u_words_wo_sw,word_common,word_total,word_share,freq_sum,freq_diff
0,1,1,75,55,14,10,22,23,14,10,7,5,12,3,24,0.125,2,0
1,1,1,35,34,7,6,16,16,7,6,4,3,7,3,13,0.230769,2,0
2,2,1,44,49,8,10,16,16,7,10,6,6,12,3,18,0.166667,3,1
3,3,3,27,28,6,8,16,16,6,7,2,2,4,3,14,0.214286,6,0
4,1,1,67,121,13,20,20,25,13,19,7,15,22,2,33,0.060606,2,0


In [127]:
# 18 additional features

# Features summary:
# 1. freq_qid1
# 2. freq_qid2
# 3. q1len
# 4. q2len
# 5. q1_words
# 6. q2_words
# 7. q1_characters
# 8. q2_characters
# 9. q1_u_words
# 10. q2_u_words
# 11. q1_u_words_wo_sw
# 12. q2_u_words _wo_sw
# 13. total_u_words_wo_sw
# 14. word_common
# 15. word_total
# 16. word_share
# 17. freq_sum
# 18. freq_diff 

df.shape

(363846, 18)

In [129]:
df.to_csv("/basic_features.csv", index=False)