#Import dependencies

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import nltk                                # Python library for NLP
import re
import matplotlib.pyplot as plt
import time

In [2]:
# #iplot imports and config
# import cufflinks as cf
# cf.go_offline()
# cf.set_config_file(offline=False, world_readable=True)
# import plotly.io as pio
# pio.renderers.default = 'colab'

In [3]:
%%time
ques = pd.read_csv("data/Questions.csv",encoding = "ISO-8859-1")

CPU times: user 7.01 s, sys: 389 ms, total: 7.4 s
Wall time: 7.39 s


In [4]:
ques.head(10)

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...
5,742,189.0,2008-08-03T15:55:28Z,30,Class views in Django,"<p><a href=""http://www.djangoproject.com/"">Dja..."
6,766,1384652.0,2008-08-03T17:44:07Z,20,Python and MySQL,<p>I can get Python to work with Postgresql bu...
7,773,207.0,2008-08-03T18:27:09Z,256,How do I use Python's itertools.groupby()?,<p>I haven't been able to find an understandab...
8,972,145.0,2008-08-04T02:17:51Z,364,Adding a Method to an Existing Object Instance,<p>I've read that it is possible to add a meth...
9,1476,92.0,2008-08-04T18:20:36Z,251,How do you express binary literals in Python?,<p>How do you express an integer as a binary n...


In [5]:
def split_html(html_string):
  """
  Separate the prose of an HTML fragment from its <code> elements.

    input:
      html_string (HTML string representing a question body or answer body)
    return:
      tuple(text,code)
      text: text of the fragment after every <code> element was removed (String)
      code: text of each <code> element, in the order find_all returned them (list)
  """
  soup = BeautifulSoup(html_string, "html.parser")
  code_list = []
  for code_tag in soup.find_all("code"):
    # extract() detaches the tag from the tree, so the get_text() call on
    # the whole soup below no longer includes the code's contents.
    detached = code_tag.extract()
    code_list.append(detached.get_text())
  return soup.get_text(), code_list

### Great example
There is a great example showing our first problem when splitting:
take a look at this answer https://stackoverflow.com/a/595  
The following code shows the same answer in our dataset
before and after splitting the code and text.  

The problem is that an answer on Stack Overflow is most likely split into more than one paragraph.  
Most likely the structure of one answer will be something like this:-  
-explanation block  
-code block  
-explanation block  
-code block  
-explanation block  
-code block  

If we concatenate the text together without the code, I think (personal opinion) we will lose the meaning.  
If we merge different code blocks into one code block, we may get syntax errors.  
I will leave this for now and move on to the rest of the notebook.

In [6]:
print("before splitting:-\n",ques.loc[5,"Body"])
get_text,get_code = split_html(ques.loc[5,"Body"])


before splitting:-
 <p><a href="http://www.djangoproject.com/">Django</a> view points to a function, which can be a problem if you want to change only a bit of functionality. Yes, I could have million keyword arguments and even more if statements in the function, but I was thinking more of an object oriented approach.</p>

<p>For example, I have a page that displays a user. This page is very similar to page that displays a group, but it's still not so similar to just use another data model. Group also has members etc...</p>

<p>One way would be to point views to class methods and then extend that class. Has anyone tried this approach or has any other idea? </p>


In [7]:
print("Text part after splitting:-\n\n\n",get_text)

Text part after splitting:-


 Django view points to a function, which can be a problem if you want to change only a bit of functionality. Yes, I could have million keyword arguments and even more if statements in the function, but I was thinking more of an object oriented approach.
For example, I have a page that displays a user. This page is very similar to page that displays a group, but it's still not so similar to just use another data model. Group also has members etc...
One way would be to point views to class methods and then extend that class. Has anyone tried this approach or has any other idea? 


In [8]:
print("code part after splitting:-\n",*get_code,sep="\n-----------new codeblock-----------\n\n")

code part after splitting:-



### Split HTML

In [9]:
%%time
splitted = ques['Body'].apply(split_html)

CPU times: user 4min 3s, sys: 501 ms, total: 4min 4s
Wall time: 4min 4s


In [10]:
%%time
text = [x[0] for x in splitted]
code = [x[1] for x in splitted]
ques["Text"]=pd.Series(text)
ques["Code"]=pd.Series(code)

CPU times: user 577 ms, sys: 16 ms, total: 593 ms
Wall time: 592 ms


###Remove newlines

In [11]:
def remove_newlines(text):
  """
  Replace line breaks in `text` with spaces.

  Each run of CRLF pairs, each run of bare CRs and each run of bare LFs is
  replaced by one single space; everything else is returned unchanged.
  """
  line_break_run = re.compile('(\r\n)+|\r+|\n+')
  return line_break_run.sub(" ", text)

###Normalize

In [12]:
def normalize(text):
  """Return `text` converted to lowercase."""
  lowered = text.lower()
  return lowered

###Tokenize

In [13]:
def tokenize(text):
  """
  Split `text` into a list of whitespace-separated tokens.

  Deliberately simple (plain word splitting) for now; it is only meant to
  help understand the data.
  """
  words = text.split()
  return words

###Remove stopwords


In [14]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/mohamd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
stop_words = set(stopwords.words('english'))

In [16]:
def remove_stopwords(tokens):
  """
  Return the items of `tokens`, in their original order, that are not
  members of the module-level `stop_words` collection.
  """
  kept = []
  for token in tokens:
    if token not in stop_words:
      kept.append(token)
  return kept

###Clean Text and Code

In [17]:
def clean_text(text):
    """
    Run the full text-cleaning pipeline on one string and return its tokens.

    Steps, in order: remove_newlines -> normalize -> tokenize ->
    remove_stopwords.
    """
    single_line = remove_newlines(text)
    lowered = normalize(single_line)
    words = tokenize(lowered)
    return remove_stopwords(words)

In [18]:
def clean_code(code):
  """
  Tokenize one code block with `tokenize`.

  Unlike clean_text, no newline removal, lowercasing or stop-word filtering
  is applied here.
  """
  code_tokens = tokenize(code)
  return code_tokens

###Process Dataset

In [None]:
%%time
ques["TitleClean"] = ques["Title"].apply(clean_text)
ques["TextClean"] = ques["Text"].apply(clean_text)
ques["CodeClean"] = ques["Code"].apply(lambda lst:[ clean_code(code) for code in lst])



In [19]:
ques.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body,Text,Code
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,I am using the Photoshop's javascript API to f...,[]
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,I have a cross-platform (Python) application w...,[]
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,I'm starting work on a hobby project with a py...,[]
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...,There are several ways to iterate over a resul...,[]
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...,I don't remember whether I was dreaming or not...,"[foo in iter_attr(array of python objects, att..."


In [21]:

# Keep only the textual columns; drop() returns a new frame, `ques` is untouched.
unused_columns = ["CreationDate", "OwnerUserId", "Id", "Score"]
que = ques.drop(columns=unused_columns)
que.head()

Unnamed: 0,Title,Body,Text,Code
0,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,I am using the Photoshop's javascript API to f...,[]
1,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,I have a cross-platform (Python) application w...,[]
2,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,I'm starting work on a hobby project with a py...,[]
3,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...,There are several ways to iterate over a resul...,[]
4,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...,I don't remember whether I was dreaming or not...,"[foo in iter_attr(array of python objects, att..."


##Save preprocessed dataset

In [23]:
#save preprocessed questions data
que.to_pickle("./Que.pkl")

#Load PreProcessed Data

In [None]:
#load PreProcessed data
# NOTE(review): the save cell earlier in this notebook writes "./Que.pkl",
# while this cell reads "./Ques.pkl"; no visible cell writes "./Ans.pkl" or
# "./Tags.pkl" either. Presumably these files come from another run or
# notebook -- confirm the paths before relying on Restart & Run All.
# read_pickle unpickles arbitrary objects, so only load files you trust.
ques = pd.read_pickle("./Ques.pkl")
ans = pd.read_pickle("./Ans.pkl")
tags = pd.read_pickle("./Tags.pkl")

# Statistics


In [None]:
pd.set_option('display.max_colwidth', None)

##General methods

###N-Grams

In [None]:
def n_gram(n,pandas_series,top_k=20):
  """
  Count n-grams over a column of token lists and keep the most frequent ones.

  input:
    n => size of the n-grams
    pandas_series => pandas column to take n-grams from (each row is a list)
    top_k => how many of the most frequent n-grams to return
  return:
      ngram => counts of the top_k most frequent n-grams (pandas_series,
               as produced by value_counts)
  """
  # Every n-gram of every row goes into one flat list before counting.
  all_grams = [
      gram
      for _, tokens in pandas_series.items()
      for gram in nltk.ngrams(tokens, n)
  ]
  counts = pd.Series(all_grams).value_counts()
  return counts[:top_k]
  

In [None]:
def plot_ngram_hist(most_occurence_ngram_series):
  """
  Draw a horizontal bar chart of n-gram counts.

  input:
    most_occurence_ngram_series => n-gram counts (pandas_series); the index
      labels are the n-grams themselves
  return(void)
    plots on the current matplotlib figure
  """
  # n is read off the length of the first valid index label.
  gram_size = len(most_occurence_ngram_series.first_valid_index())
  shown = most_occurence_ngram_series.shape[0]
  ascending = most_occurence_ngram_series.sort_values()
  ascending.plot.barh(color='blue', width=.9, figsize=(12, 8))
  plt.title('{} Most Frequently Occuring {}-grams'.format(shown, gram_size))
  plt.ylabel('{}-gram'.format(gram_size))
  plt.xlabel('# of Occurances')

##Questions statistics

###Questions Title statistics

In [None]:
#1-grams
%%time
top_grams = n_gram(1,ques["TitleClean"])
plot_ngram_hist(top_grams)

In [None]:
#2-grams
%%time
top_grams = n_gram(2,ques["TitleClean"])
plot_ngram_hist(top_grams)

In [None]:
#3-grams
%%time
top_grams = n_gram(3,ques["TitleClean"])
plot_ngram_hist(top_grams)

In [None]:
#4-grams
%%time
top_grams = n_gram(4,ques["TitleClean"])
plot_ngram_hist(top_grams)

In [None]:
print("number of Questions :" ,ques.shape[0])

In [None]:
ques["TitleLen"]= ques["TitleClean"].apply(len)

In [None]:
# Histogram of title lengths (in tokens). Drawn with pandas' matplotlib
# plotting because `.iplot` is only available after `import cufflinks`, and
# that import is commented out at the top of this notebook.
ax = ques["TitleLen"].plot.hist(
    bins=50,
    edgecolor='black',
    title='Number of words in Title')
ax.set_xlabel('Number of words in Title')
ax.set_ylabel('count');

In [None]:
print("Mean of word in Title ",ques["TitleLen"].mean())

In [None]:
print("Min of word in Title ",ques["TitleLen"].min())

In [None]:
print("Max of word in Title ",ques["TitleLen"].max())

###Questions BodyText statistics

In [None]:
#1-grams
%%time
top_grams = n_gram(1,ques["TextClean"])
plot_ngram_hist(top_grams)

In [None]:
#2-grams
%%time
top_grams = n_gram(2,ques["TextClean"])
plot_ngram_hist(top_grams)

In [None]:
#3-grams
%%time
top_grams = n_gram(3,ques["TextClean"])
plot_ngram_hist(top_grams)

In [None]:
ques["TextLen"]= ques["TextClean"].apply(len)

In [None]:
# Histogram of body lengths below 500 tokens. Drawn with pandas' matplotlib
# plotting because `.iplot` is only available after `import cufflinks`, and
# that import is commented out at the top of this notebook.
# .loc with a boolean mask replaces the chained ques[mask]['TextLen'] lookup.
short_bodies = ques.loc[ques['TextLen'] < 500, 'TextLen']
ax = short_bodies.plot.hist(
    bins=50,
    edgecolor='black',
    title='Number of words in Body < 500')
ax.set_xlabel('Number of English words in Question Body')
ax.set_ylabel('count');

In [None]:
# Histogram of body lengths of 500 tokens or more. The bound is >= 500 so
# that, together with the "< 500" plot, every question is shown exactly once
# (a strict "> 500" leaves out bodies of exactly 500 tokens).
# Drawn with pandas' matplotlib plotting because `.iplot` is only available
# after `import cufflinks`, and that import is commented out at the top of
# this notebook.
long_bodies = ques.loc[ques['TextLen'] >= 500, 'TextLen']
ax = long_bodies.plot.hist(
    bins=100,
    edgecolor='black',
    title='Number of words in Body >= 500')
ax.set_xlabel('Number of English words in Question Body')
ax.set_ylabel('count');

In [None]:
# TextLen is the token count of the question body, so label it as Body.
print("Mean of word in Body ",ques["TextLen"].mean())

In [None]:
# TextLen is the token count of the question body, so label it as Body.
print("Min of word in Body ",ques["TextLen"].min())

In [None]:
# This prints the maximum of the body token counts, so label it Max / Body.
print("Max of word in Body ",ques["TextLen"].max())

###Questions Score statistics

In [None]:
# Share of questions whose score is below zero, in percent.
# The mean of a boolean mask is the fraction of True rows.
per = (ques['Score'] < 0).mean() * 100
# .2f (two decimals) instead of .2g: .2g keeps only two significant digits
# and switches to scientific notation at 100 ("1e+02").
print("persantage of questions with negative score: {0:.2f}%".format(per))

In [None]:
# Share of questions whose score lies in [0, 5], in percent.
# The mean of a boolean mask is the fraction of True rows.
per = ((ques['Score'] >= 0) & (ques['Score'] <= 5)).mean() * 100
# .2f (two decimals) instead of .2g: .2g keeps only two significant digits
# and switches to scientific notation at 100 ("1e+02").
print("persantage of questions with score from 0-5: {0:.2f}%".format(per))

In [1]:
# Share of questions whose score is exactly zero, in percent.
# The mean of a boolean mask is the fraction of True rows.
per = (ques['Score'] == 0).mean() * 100
# .2f (two decimals) instead of .2g: .2g keeps only two significant digits
# and switches to scientific notation at 100 ("1e+02").
print("persantage of questions with score equal 0: {0:.2f}%".format(per))

NameError: name 'ques' is not defined

In [9]:
import pickle
# Open the file in a `with` block so the handle is closed after loading
# (the bare open(...) passed to pickle.load was never closed).
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("corrected_words.pkl", "rb") as pickle_file:
  o = pickle.load(pickle_file)

'help'

In [3]:
import pandas as pd
df = pd.read_csv("CS4.csv")

In [5]:
df.head()

Unnamed: 0,طابع زمني,Name,phone,project Name,E-mail,Academic Supervisor,Assisting Supervisor,Title (your field),tools
0,2021/07/10 12:20:54 ص غرينتش+2,Mohamed ahmed mousa,1012607079,Tourguide(AR),,dr:mohamed elzewaidy,eng:esraa ezzat,flutter developer,AR
1,2021/07/10 12:41:01 ص غرينتش+2,mohamed gamal,1111943712,Tour guide,mohamedgee25@gmail.com,Dr.mohamedelzewaidy,Eng.esraa ezzat,Unity(agmunted reality),Unity \nRevit
