## **Importing Libraries**

In [2]:
import re
import string
from nltk import WordNetLemmatizer
import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
import matplotlib.pyplot as plt
from textblob import TextBlob

## **Read Dataset**

In [3]:
Questions = pd.read_csv('Questions.csv', encoding='latin')
Answers = pd.read_csv('Answers.csv', encoding='latin')
Tags = pd.read_csv('Tags.csv', encoding='latin')

In [4]:
print(Questions.head(3))
print(Answers.head(3))
print(Tags.head(3))


    Id  OwnerUserId          CreationDate            ClosedDate  Score  \
0   80         26.0  2008-08-01T13:57:07Z                   NaN     26   
1   90         58.0  2008-08-01T14:41:24Z  2012-12-26T03:45:49Z    144   
2  120         83.0  2008-08-01T15:50:08Z                   NaN     21   

                                               Title  \
0  SQLStatement.execute() - multiple queries in o...   
1  Good branching and merging tutorials for Torto...   
2                                  ASP.NET Site Maps   

                                                Body  
0  <p>I've written a database generation script i...  
1  <p>Are there any really good tutorials explain...  
2  <p>Has anyone got experience creating <strong>...  
    Id  OwnerUserId          CreationDate  ParentId  Score  \
0   92         61.0  2008-08-01T14:45:37Z        90     13   
1  124         26.0  2008-08-01T16:09:47Z        80     12   
2  199         50.0  2008-08-01T19:36:46Z       180      1   

         

In [5]:
Questions.columns = ['Id', 'OwnerUserId', 'CreationDate', 'CloseDate', 'Score', 'Title', 'Question']
Answers.columns = ['Id_normal', 'OwnerUserId', 'CreationDate', 'Id', 'Score', 'Answer']
Answers.drop(columns=['Id_normal', 'OwnerUserId', 'CreationDate'], inplace=True)
Answers = Answers.groupby('Id')['Answer'].apply(lambda answer: ' '.join(answer))
Answers = Answers.to_frame().reset_index()
print(Answers)

               Id                                             Answer
0              80  <p>I wound up using this. It is a kind of a ha...
1              90  <p><a href="http://svnbook.red-bean.com/">Vers...
2             120  <p>The Jeff Prosise version from MSDN magazine...
3             180  <p>I've read somewhere the human eye can't dis...
4             260  <p>Yes, I thought about that, but I soon figur...
...           ...                                                ...
1102563  40142860  <p>It's faster and more reliable to work with ...
1102564  40142900  <p>It's not you, it's LinkedIn. See others com...
1102565  40142910  <p>Try add <code>retrun false</code> in the <c...
1102566  40142940  <p>Here's how you can do it:</p>\n\n<pre><code...
1102567  40143190  <p>Use a here-doc:</p>\n\n<pre><code>result=$(...

[1102568 rows x 2 columns]


In [6]:
Tags['Tag'] = Tags['Tag'].astype(str)
Tags = Tags.groupby('Id')['Tag'].apply(lambda tag: ' '.join(tag))
Tags = Tags.to_frame().reset_index()

print(Tags)

               Id                                             Tag
0              80                         flex actionscript-3 air
1              90    svn tortoisesvn branch branching-and-merging
2             120                             sql asp.net sitemap
3             180  algorithm language-agnostic colors color-space
4             260         c# .net scripting compiler-construction
...           ...                                             ...
1264211  40143210                                   php .htaccess
1264212  40143300                                 google-bigquery
1264213  40143340                          android android-studio
1264214  40143360                               javascript vue.js
1264215  40143380                                 npm mocha babel

[1264216 rows x 2 columns]


In [7]:
new_data = Questions.merge(Answers, how='left', on='Id')
new_data = new_data.merge(Tags, how='left', on='Id')
new_data.drop(columns=['OwnerUserId', 'CreationDate', 'CloseDate'], inplace=True)
new_data.columns = ['id', 'score', 'title', 'question', 'answer', 'tag']

count = new_data.groupby('tag')['tag'].count()
count = count.to_frame()
count.columns = ['TagCount']
count = count.reset_index()

print(count)

                                         tag  TagCount
0                                         .a         1
1                                  .htaccess       340
2                        .htaccess .htpasswd         4
3       .htaccess absolute-path relative-url         1
4                     .htaccess addon-domain         1
...                                      ...       ...
685695     zurb-foundation zurb-foundation-5         3
685696     zurb-foundation zurb-foundation-6         1
685697                     zurb-foundation-6         1
685698                                  zuul         1
685699                            zxing zbar         1

[685700 rows x 2 columns]


In [8]:
new_data = pd.merge(new_data, count, how='left', on='tag')
new_data = new_data.dropna()
new_data = new_data[(new_data['TagCount'] >= 1100) & (new_data['score'] > 7)]


print("Data After Preparation")
print(new_data)
new_data.drop(columns=['score', 'id', 'TagCount'], inplace=True)

Data After Preparation
               id  score                                              title  \
11           1010     14  How to get the value of built, encoded ViewState?   
23           2120     77                       Convert HashBytes to VarChar   
34           2900     14              MySQL/Apache Error in PHP MySQL query   
48           4230     34  The Difference Between a DataGrid and a GridVi...   
192         17870     11                        Select ..... where .... OR    
...           ...    ...                                                ...   
1228605  39261590     10  Edittext cursor still blinks after closing the...   
1234096  39410260     13  App is crashing after capturing picture using ...   
1235166  39439760     10      How to preview multiple images before upload?   
1249431  39797350      8                    Log.wtf vs. Unhandled Exception   
1251020  39834660      8  Optimizing the use of arguments inside a function   

                            

## **Word Lematization**

In [9]:
Lematizer = WordNetLemmatizer()

def punctuation_remover(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

def Word_Lemmatizer(text):
    lemma = [Lematizer.lemmatize(word) for word in text]
    return lemma

new_data['title'] = new_data['title'].astype(str)
new_data['question'] = new_data['question'].astype(str)
new_data['answer'] = new_data['answer'].astype(str)

new_data['title'] = new_data['title'].apply(punctuation_remover)
new_data['question'] = new_data['question'].apply(punctuation_remover)
new_data['answer'] = new_data['answer'].apply(punctuation_remover)
print("Data after punctuation_remover ")
print(new_data)



Data after punctuation_remover 
                                                     title  \
11         How to get the value of built encoded ViewState   
23                            Convert HashBytes to VarChar   
34                    MySQLApache Error in PHP MySQL query   
48       The Difference Between a DataGrid and a GridVi...   
192                                     Select  where  OR    
...                                                    ...   
1228605  Edittext cursor still blinks after closing the...   
1234096  App is crashing after capturing picture using ...   
1235166       How to preview multiple images before upload   
1249431                      Logwtf vs Unhandled Exception   
1251020  Optimizing the use of arguments inside a function   

                                                  question  \
11       pI need to grab the base64encoded representati...   
23       pI want to get the MD5 Hash of a string value ...   
34       pI am getting the following 

In [10]:
new_data['title'] = new_data['title'].str.lower()
new_data['question'] = new_data['question'].str.lower()
new_data['answer'] = new_data['answer'].str.lower()

new_data['question'] = new_data['question'].apply(lambda question: re.sub('<[^<]+?>', '', question))
new_data['answer'] = new_data['answer'].apply(lambda answer: re.sub('<[^<]+?>', '', answer))
new_data['title'] = new_data['title'].apply(lambda title: re.sub('<[^<]+?>', '', title))
print("Data after Removing HTML tags and Changing texts into lowercase")
print(new_data)

Data after Removing HTML tags and Changing texts into lowercase
                                                     title  \
11         how to get the value of built encoded viewstate   
23                            convert hashbytes to varchar   
34                    mysqlapache error in php mysql query   
48       the difference between a datagrid and a gridvi...   
192                                     select  where  or    
...                                                    ...   
1228605  edittext cursor still blinks after closing the...   
1234096  app is crashing after capturing picture using ...   
1235166       how to preview multiple images before upload   
1249431                      logwtf vs unhandled exception   
1251020  optimizing the use of arguments inside a function   

                                                  question  \
11       pi need to grab the base64encoded representati...   
23       pi want to get the md5 hash of a string value ...   
34   

In [11]:
new_data['question'] = new_data['question'].str.split()
new_data['answer'] = new_data['answer'].str.split()
new_data['title'] = new_data['title'].str.split()

new_data['title'] = new_data['title'].apply(lambda title: Word_Lemmatizer(title))
new_data['answer'] = new_data['answer'].apply(lambda answer: Word_Lemmatizer(answer))
new_data['question'] = new_data['question'].apply(lambda question: Word_Lemmatizer(question))
print("After Word Lemmatixer")
print(new_data)

After Word Lemmatixer
                                                     title  \
11       [how, to, get, the, value, of, built, encoded,...   
23                       [convert, hashbytes, to, varchar]   
34             [mysqlapache, error, in, php, mysql, query]   
48       [the, difference, between, a, datagrid, and, a...   
192                                    [select, where, or]   
...                                                    ...   
1228605  [edittext, cursor, still, blink, after, closin...   
1234096  [app, is, crashing, after, capturing, picture,...   
1235166  [how, to, preview, multiple, image, before, up...   
1249431                  [logwtf, v, unhandled, exception]   
1251020  [optimizing, the, use, of, argument, inside, a...   

                                                  question  \
11       [pi, need, to, grab, the, base64encoded, repre...   
23       [pi, want, to, get, the, md5, hash, of, a, str...   
34       [pi, am, getting, the, following, erro

In [12]:
new_data['title'] = new_data['title'].apply(lambda title: [word for word in title if word not in stopwords.words('english')])
new_data['question'] = new_data['question'].apply(lambda question: [word for word in question if word not in stopwords.words('english')])
new_data['answer'] = new_data['answer'].apply(lambda answer: [word for word in answer if word not in stopwords.words('english')])
print("data after stop words")
print(new_data)

data after stop words
                                                     title  \
11                 [get, value, built, encoded, viewstate]   
23                           [convert, hashbytes, varchar]   
34                 [mysqlapache, error, php, mysql, query]   
48                [difference, datagrid, gridview, aspnet]   
192                                               [select]   
...                                                    ...   
1228605  [edittext, cursor, still, blink, closing, soft...   
1234096  [app, crashing, capturing, picture, using, int...   
1235166                 [preview, multiple, image, upload]   
1249431                  [logwtf, v, unhandled, exception]   
1251020      [optimizing, use, argument, inside, function]   

                                                  question  \
11       [pi, need, grab, base64encoded, representation...   
23       [pi, want, get, md5, hash, string, value, sql,...   
34       [pi, getting, following, errorp, block

## **Sentiment Analysis**

In [13]:
def get_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Adding sentiment scores to the dataset
new_data['title_sentiment'] = new_data['title'].apply(lambda x: get_sentiment(str(x)))
new_data['question_sentiment'] = new_data['question'].apply(lambda x: get_sentiment(str(x)))
new_data['answer_sentiment'] = new_data['answer'].apply(lambda x: get_sentiment(' '.join(str(x))))

# Display some sentiment scores
print("Sentiment Scores:")
print(new_data[['title', 'title_sentiment', 'question', 'question_sentiment', 'answer' , "answer_sentiment"]].head())

Sentiment Scores:
                                        title  title_sentiment  \
11    [get, value, built, encoded, viewstate]              0.0   
23              [convert, hashbytes, varchar]              0.0   
34    [mysqlapache, error, php, mysql, query]              0.0   
48   [difference, datagrid, gridview, aspnet]              0.0   
192                                  [select]              0.0   

                                              question  question_sentiment  \
11   [pi, need, grab, base64encoded, representation...            0.025000   
23   [pi, want, get, md5, hash, string, value, sql,...            0.000000   
34   [pi, getting, following, errorp, blockquote, p...            0.188095   
48   [pive, aspnet, development, little, ive, used,...            0.218750   
192  [pi, way, select, data, one, multiple, conditi...           -0.083333   

                                                answer  answer_sentiment  
11   [prex, suspect, good, place, start, 

## **TF-IDF**

In [14]:
vectorizer = TfidfVectorizer()
new_data['title'] = new_data['title'].astype(str)
new_data['answer'] = new_data['answer'].astype(str)
new_data['question'] = new_data['question'].astype(str)

In [15]:

X1 = vectorizer.fit_transform(new_data['title'].str.lower())
X2 = vectorizer.fit_transform(new_data['answer'].str.lower())
X3 = vectorizer.fit_transform(new_data['question'].str.lower())

In [17]:
label_encoder = LabelEncoder()
new_data['tag'] = label_encoder.fit_transform(new_data['tag'])
print(new_data)
y = new_data['tag'].values
print(y)


                                                     title  \
11       ['get', 'value', 'built', 'encoded', 'viewstate']   
23                     ['convert', 'hashbytes', 'varchar']   
34       ['mysqlapache', 'error', 'php', 'mysql', 'query']   
48        ['difference', 'datagrid', 'gridview', 'aspnet']   
192                                             ['select']   
...                                                    ...   
1228605  ['edittext', 'cursor', 'still', 'blink', 'clos...   
1234096  ['app', 'crashing', 'capturing', 'picture', 'u...   
1235166         ['preview', 'multiple', 'image', 'upload']   
1249431          ['logwtf', 'v', 'unhandled', 'exception']   
1251020  ['optimizing', 'use', 'argument', 'inside', 'f...   

                                                  question  \
11       ['pi', 'need', 'grab', 'base64encoded', 'repre...   
23       ['pi', 'want', 'get', 'md5', 'hash', 'string',...   
34       ['pi', 'getting', 'following', 'errorp', 'bloc...   
48     

In [18]:
x_train, x_test, y_train, y_test = train_test_split(X2, new_data['tag'], test_size=0.4, random_state=10)
x_train_svm, x_test_svm, y_train_svm, y_test_svm = train_test_split(X2, new_data['tag'], test_size=0.35, random_state=10)

models = {
    'KNN': KNeighborsClassifier(),
    'SVM': svm.SVC(kernel='linear', C=10, random_state=0),
    'Random Forest': RandomForestClassifier(n_estimators=2000),
    'Decision Tree': DecisionTreeClassifier(random_state=10),
    'GBM': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=10),
    'Logistic Regression': LogisticRegression()
}

accuracies = {}

for model_name, model in models.items():
    if model_name == 'KNN':
        accuracy = []
        for i in range(1, 100):
            knn = KNeighborsClassifier(n_neighbors=i).fit(x_train, y_train)
            prediction = knn.predict(x_test)
            accuracy.append(accuracy_score(y_test, prediction))
        accuracies[model_name] = max(accuracy)
    elif model_name == 'SVM':
        svm_model = svm.SVC(kernel='linear', C=10, random_state=0).fit(x_train_svm, y_train_svm)
        pred_svm = svm_model.predict(x_test_svm)
        accuracies[model_name] = accuracy_score(y_test_svm, pred_svm)
    else:
        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        accuracies[model_name] = accuracy_score(y_test, pred)

# Print accuracies
for model_name, accuracy in accuracies.items():
    print(f'Accuracy of {model_name}: {accuracy}')



Accuracy of KNN: 0.5157715260017051
Accuracy of SVM: 0.5657254138266796
Accuracy of Random Forest: 0.5532821824381927
Accuracy of Decision Tree: 0.4390451832907076
Accuracy of GBM: 0.5268542199488491
Accuracy of Logistic Regression: 0.4475703324808184
