RELATIONSHIP BETWEEN VARIABLES TO PREDICT QUESTIONS ANSWERED AFTER EMAIL NOTIFICATION IS SENT

In [None]:
#importing libraries
import pandas as pd
import numpy as np
import datetime

In [454]:
#reading data
# Both CSVs are resolved relative to the notebook's working directory:
# questions.csv -> ques, answers_metrics.csv -> ans.
ques, ans = (pd.read_csv(csv_name) for csv_name in ("questions.csv", "answers_metrics.csv"))

In [455]:
#merging questions dataset with answer_metrics to get sentiment score
# Left join keeps every question; a question with several answers yields one row per answer.
# Columns present in both frames keep the question-side copy; the answer-side copy is
# tagged with the '_drop' suffix so it can be removed right after the merge.
ques_ans = ques.merge(ans, left_on='questions_id', right_on='answers_question_id', how='left', suffixes=('', '_drop'))

# Remove only the suffixed duplicates. Matching on the suffix (rather than on the
# substring 'drop' anywhere in the name) avoids deleting a legitimate column whose
# name merely contains "drop".
ques_ans = ques_ans.drop(columns=[col for col in ques_ans.columns if str(col).endswith('_drop')])


In [456]:
#formatting dates
# Each timestamp string is cut to its first 19 characters ('YYYY-MM-DD HH:MM:SS'),
# discarding whatever follows, and then parsed with an explicit format.
for date_column in ('first_email_sent', 'first_answer_posted', 'questions_date_added'):
    ques_ans[date_column] = pd.to_datetime(ques_ans[date_column].str[:19], format='%Y-%m-%d %H:%M:%S')

In [457]:
#calculating time difference between first email sent and first answer posted
# Dividing the timedelta by a one-hour unit converts it to a float number of hours.
ques_ans['diff_hours'] = (
    ques_ans['first_answer_posted'] - ques_ans['first_email_sent']
) / np.timedelta64(1, 'h')

In [458]:
#dataframe of rows whose first answer was posted before the first email notification went out
# Rows where either timestamp is missing compare as False and are therefore not selected here.
ques_ans_withoutemails = ques_ans.loc[
    ques_ans['first_email_sent'].gt(ques_ans['first_answer_posted'])
]

In [459]:
#dataframe where answers are posted only after at least one email is sent to the professionals
# This is the complement of ques_ans_withoutemails within ques_ans. Dropping by index label
# removes exactly those rows; the earlier concat + drop_duplicates(keep=False) trick would
# additionally discard any rows of ques_ans that happen to be exact duplicates of each other.
# Rows with a missing email/answer timestamp are kept here (they are not in the
# "without emails" frame) and are handled by the later null-value step.
# NOTE(review): relies on ques_ans having unique index labels, which holds for the
# default index produced by the merge above.
ques_ans_withemails = ques_ans.drop(index=ques_ans_withoutemails.index)

In [460]:
#checking percentage of null values
# The mean of the boolean null-mask is the fraction of missing entries per column.
ques_ans_withemails.isna().mean() * 100

Unnamed: 0                       0.000000
questions_id                     0.000000
questions_author_id              0.000000
questions_date_added             0.000000
questions_title                  0.000000
questions_body                   0.000000
question_score                   0.000000
all_tags                         3.330947
tag_count                        0.000000
emails_sent_count                0.000000
professionals_notified_count     0.000000
first_email_sent                 9.287489
last_email_sent                  9.287489
total_answers                    0.000000
first_answer_posted              1.728639
last_answer_posted               1.728639
total_comments                   0.000000
first_comment_posted            68.294943
last_comment_posted             94.035036
students_location                4.804817
students_date_joined             0.869584
answers_id                       1.728639
answers_date_added               1.728639
answers_question_id              1

In [461]:
#handling null values
# Columns removed outright (the two comment timestamps are mostly null in the check above).
columns_to_drop = ['first_comment_posted', 'last_comment_posted', 'Pro_industry_category']

# A row is discarded if any of these columns is missing.
required_columns = [
    'all_tags', 'first_email_sent', 'last_email_sent', 'first_answer_posted',
    'last_answer_posted', 'students_location', 'students_date_joined', 'answers_id',
    'answers_date_added', 'answers_question_id', 'answers_author_id',
    'professionals_location', 'answer_tag_sim_index', 'answer_score', 'sentiment_score',
    'sentiment_category', 'Dominant_Topic', 'Topic_Perc_Contrib', 'diff_hours',
]

ques_ans_withemails = (
    ques_ans_withemails
    .drop(columns=columns_to_drop)
    .dropna(subset=required_columns)
)


In [462]:
#checking percentage of null values
# Re-run of the per-column missing-value percentage after the null-handling step.
ques_ans_withemails.isna().mean() * 100

Unnamed: 0                      0.0
questions_id                    0.0
questions_author_id             0.0
questions_date_added            0.0
questions_title                 0.0
questions_body                  0.0
question_score                  0.0
all_tags                        0.0
tag_count                       0.0
emails_sent_count               0.0
professionals_notified_count    0.0
first_email_sent                0.0
last_email_sent                 0.0
total_answers                   0.0
first_answer_posted             0.0
last_answer_posted              0.0
total_comments                  0.0
students_location               0.0
students_date_joined            0.0
answers_id                      0.0
answers_date_added              0.0
answers_question_id             0.0
answers_author_id               0.0
professionals_location          0.0
answer_tag_sim_index            0.0
answer_score                    0.0
sentiment_score                 0.0
sentiment_category          

In [463]:
# Print every column name of the cleaned frame, one per line, in positional order.
for col in ques_ans_withemails.columns:
    print(col)

Unnamed: 0
questions_id
questions_author_id
questions_date_added
questions_title
questions_body
question_score
all_tags
tag_count
emails_sent_count
professionals_notified_count
first_email_sent
last_email_sent
total_answers
first_answer_posted
last_answer_posted
total_comments
students_location
students_date_joined
answers_id
answers_date_added
answers_question_id
answers_author_id
professionals_location
answer_tag_sim_index
answer_score
sentiment_score
sentiment_category
Dominant_Topic
Topic_Perc_Contrib
diff_hours


In [464]:
#MODEL 1 - Linear Regression on Response Time after Email Notification is sent

#Splitting dataset
# Predictors and target are selected by column name instead of by position, so the
# selection does not silently shift if columns are added, removed or reordered upstream.
# These are the same columns the positional indices [6,8,9,13,16,25,26] and -1 resolved to.
model1_features = [
    'question_score',
    'tag_count',
    'emails_sent_count',
    'total_answers',
    'total_comments',
    'answer_score',
    'sentiment_score',
]
X = ques_ans_withemails[model1_features]
# Target: hours between the first email sent and the first answer posted.
Y = ques_ans_withemails['diff_hours']
X

Unnamed: 0,question_score,tag_count,emails_sent_count,total_answers,total_comments,answer_score,sentiment_score
0,1.0,3.0,200.0,1.0,1.0,0.0,0.481070
1,5.0,2.0,62.0,2.0,0.0,0.0,1.248724
2,5.0,2.0,62.0,2.0,0.0,0.0,1.285241
6,2.0,4.0,320.0,2.0,0.0,0.0,0.360555
7,2.0,4.0,320.0,2.0,0.0,0.0,0.242536
...,...,...,...,...,...,...,...
51939,4.0,2.0,38.0,4.0,0.0,0.0,0.472456
51940,10.0,4.0,190.0,4.0,3.0,1.0,0.750851
51941,10.0,4.0,190.0,4.0,3.0,1.0,1.171557
51942,10.0,4.0,190.0,4.0,3.0,2.0,0.982500


In [465]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit of diff_hours (Y) on the Model 1 predictors (X).
model = LinearRegression()
model.fit(X=X, y=Y)

LinearRegression()

In [466]:
# Print the fitted intercept of the linear model.
# The intercept (often labelled the constant) is the predicted value of Y when every predictor in X equals 0.
print(model.intercept_)

1474.5507063236832


In [467]:
#The sign of each coefficient indicates the direction of the relationship between a predictor variable and the response variable.
# One row per predictor, labelled with the predictor's column name.
coeff_parameter = pd.DataFrame({'Coefficient': model.coef_}, index=X.columns)
coeff_parameter

Unnamed: 0,Coefficient
question_score,10.494921
tag_count,62.484478
emails_sent_count,0.016415
total_answers,-89.134509
total_comments,-76.396818
answer_score,-309.07412
sentiment_score,-163.940327


In [468]:
import statsmodels.api as sm

# statsmodels does not add an intercept on its own, so a constant column is added first.
X_Sm = sm.add_constant(X)

# Fit OLS for Model 1 and show the full regression summary table.
ls = sm.OLS(endog=Y, exog=X_Sm).fit()
print(ls.summary())

                            OLS Regression Results                            
Dep. Variable:             diff_hours   R-squared:                       0.040
Model:                            OLS   Adj. R-squared:                  0.039
Method:                 Least Squares   F-statistic:                     218.2
Date:                Mon, 09 May 2022   Prob (F-statistic):          4.03e-319
Time:                        14:19:04   Log-Likelihood:            -3.4516e+05
No. Observations:               37118   AIC:                         6.903e+05
Df Residuals:                   37110   BIC:                         6.904e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const              1474.5507     34.45

  x = pd.concat(x[::order], 1)


In [469]:
# Model 2 - Predicting total answers based on time taken to respond after first email sent
# Columns are selected by name instead of by position, so the selection does not silently
# shift if the frame's column order changes. These are the same columns the positional
# indices [6,8,9,26,30] and [13] resolved to.
model2_features = [
    'question_score',
    'tag_count',
    'emails_sent_count',
    'sentiment_score',
    'diff_hours',
]
X2 = ques_ans_withemails[model2_features]
# Kept as a single-column DataFrame (not a Series), as before.
Y2 = ques_ans_withemails[['total_answers']]
X2


Unnamed: 0,question_score,tag_count,emails_sent_count,sentiment_score,diff_hours
0,1.0,3.0,200.0,0.481070,80.429444
1,5.0,2.0,62.0,1.248724,17061.496944
2,5.0,2.0,62.0,1.285241,17061.496944
6,2.0,4.0,320.0,0.360555,688.219722
7,2.0,4.0,320.0,0.242536,688.219722
...,...,...,...,...,...
51939,4.0,2.0,38.0,0.472456,43.681389
51940,10.0,4.0,190.0,0.750851,0.100000
51941,10.0,4.0,190.0,1.171557,0.100000
51942,10.0,4.0,190.0,0.982500,0.100000


In [470]:
# Fit Model 2 on its own estimator. Refitting the shared `model` object would overwrite
# Model 1's fitted intercept and coefficients, so re-running the Model 1 inspection cells
# afterwards would report (or fail on) Model 2's parameters instead.
model2 = LinearRegression()
model2.fit(X2, Y2)

LinearRegression()

In [471]:
# Add the intercept column for statsmodels, then fit OLS for Model 2
# (total_answers on the Model 2 predictors) and show the regression summary.
X2_Sm = sm.add_constant(X2)

ls = sm.OLS(endog=Y2, exog=X2_Sm).fit()
print(ls.summary())

                            OLS Regression Results                            
Dep. Variable:          total_answers   R-squared:                       0.508
Model:                            OLS   Adj. R-squared:                  0.508
Method:                 Least Squares   F-statistic:                     7670.
Date:                Mon, 09 May 2022   Prob (F-statistic):               0.00
Time:                        14:19:18   Log-Likelihood:                -89360.
No. Observations:               37118   AIC:                         1.787e+05
Df Residuals:                   37112   BIC:                         1.788e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 1.8624      0.03

  x = pd.concat(x[::order], 1)
