In [1]:
import os
import re

import en_core_web_sm
import gensim.downloader as api
import pandas as pd
import spacy
import unidecode
from bs4 import BeautifulSoup
from pycontractions import Contractions
from word2number import w2n

In [2]:
# spaCy pipeline used by text_preprocessing to tokenise and tag text.
nlp = en_core_web_sm.load()

# gensim model handed to pycontractions as kv_model. To try a different
# model, swap the active line for one of the commented alternatives.
model = api.load("glove-twitter-25")
# model = api.load("glove-twitter-100")
# model = api.load("word2vec-google-news-300")

cont = Contractions(kv_model=model)
cont.load_models()

# Clear the stop-word flag on these words so that the stop-word filter in
# text_preprocessing (which checks token.is_stop) does not drop them.
deselect_stop_words = ['no', 'not']
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False


def strip_html_tags(text):
    """Parse ``text`` as HTML and return only its text content.

    Text fragments are joined with a single space as separator.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")


def remove_whitespace(text):
    """Trim ``text`` and collapse every run of whitespace into one space."""
    # split() with no argument discards leading/trailing whitespace and
    # treats consecutive whitespace characters as a single separator.
    words = text.split()
    return " ".join(words)


def remove_accented_chars(text):
    """Return ``text`` passed through ``unidecode`` to drop accents (e.g. in café)."""
    return unidecode.unidecode(text)


def expand_contractions(text):
    """Expand shortened forms in ``text`` (e.g. don't -> do not).

    The text is sent as a one-element batch through the module-level
    ``cont`` object and the first (only) result is returned.
    """
    expanded_batch = list(cont.expand_texts([text], precise=True))
    return expanded_batch[0]


def text_preprocessing(text, accented_chars=True, contractions=True,
                       convert_num=True, extra_whitespace=True,
                       lemmatization=True, lowercase=True, punctuations=False,
                       remove_html=True, remove_num=False, special_chars=False,
                       stop_words=True):
    """Clean a raw text string and return the tokens that survive.

    String-level steps run first, in this order: HTML stripping, whitespace
    collapsing, accent removal, contraction expansion, lowercasing. The
    result is tokenised with the module-level spaCy pipeline ``nlp`` and
    every token is then either dropped or (optionally) rewritten.

    Each flag is tested for truthiness. Note that ``punctuations``,
    ``remove_num`` and ``special_chars`` are off by default; every other
    step is on.

    Parameters
    ----------
    text : str
        Raw input text.
    accented_chars : bool
        Apply ``remove_accented_chars``.
    contractions : bool
        Apply ``expand_contractions``.
    convert_num : bool
        Replace tokens tagged ``NUM`` by the result of ``w2n.word_to_num``
        when that call succeeds; otherwise the token text is kept as is.
    extra_whitespace : bool
        Apply ``remove_whitespace``.
    lemmatization : bool
        Replace a token by its lemma, unless the lemma is ``"-PRON-"``.
        Not applied to ``NUM`` tokens when ``convert_num`` is on.
    lowercase : bool
        Lowercase the text before tokenising.
    punctuations : bool
        Drop tokens tagged ``PUNCT``.
    remove_html : bool
        Apply ``strip_html_tags``.
    remove_num : bool
        Drop tokens tagged ``NUM`` or whose text is numeric.
    special_chars : bool
        Drop tokens tagged ``SYM``.
    stop_words : bool
        Drop stop-word tokens, except those tagged ``NUM``.

    Returns
    -------
    list
        Kept tokens in document order. Entries are strings, except number
        tokens that were converted, which hold whatever
        ``w2n.word_to_num`` returned.
    """
    # --- string-level clean-up -------------------------------------------
    if remove_html:
        text = strip_html_tags(text)
    if extra_whitespace:
        text = remove_whitespace(text)
    if accented_chars:
        text = remove_accented_chars(text)
    if contractions:
        text = expand_contractions(text)
    if lowercase:
        text = text.lower()

    # --- token-level filtering and rewriting -----------------------------
    doc = nlp(text)
    clean_text = []

    for token in doc:
        pos = token.pos_

        # Filters: the first matching rule drops the token.
        # Numbers are exempt from stop-word removal so that number words
        # can still be converted or removed by the number rules.
        if stop_words and token.is_stop and pos != 'NUM':
            continue
        if punctuations and pos == 'PUNCT':
            continue
        if special_chars and pos == 'SYM':
            continue
        if remove_num and (pos == 'NUM' or token.text.isnumeric()):
            continue

        edit = token.text
        if convert_num and pos == 'NUM':
            try:
                edit = w2n.word_to_num(token.text)
            except Exception:
                # Best effort: keep the original token text when it cannot
                # be parsed as a number. Unlike a bare ``except``, this does
                # not swallow KeyboardInterrupt/SystemExit, so a long run
                # can still be interrupted.
                pass
        elif lemmatization and token.lemma_ != "-PRON-":
            # "-PRON-" is the placeholder lemma spaCy 2.x gives pronouns;
            # the original pronoun text is kept in that case.
            edit = token.lemma_

        if edit != "":
            clean_text.append(edit)
    return clean_text

In [3]:
def listToString(s):
    """Concatenate the ``str`` form of each element of ``s``, each followed by a space.

    A non-empty input therefore yields a string ending in one trailing
    space; an empty input yields the empty string.
    """
    return "".join(str(item) + ' ' for item in s)

In [4]:
# Directory that holds the input JSON files read by the cells below.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
data_dir = '/home/ckm/visualqatickets/superuser/Data/raw_pandas'

In [5]:
# Read superuser_nosw.json into the DataFrame `qr`. orient='table' means the
# file is expected to be in pandas' table-schema JSON layout.
with open(data_dir+'/superuser_nosw.json','r') as a:
    qr = pd.read_json(a,orient='table')

In [6]:
# Read sup_linked_fin.json into the DataFrame `ln` (same table-schema JSON
# layout). Later cells read its 'Linked' column and copy rows from it into qr.
with open(data_dir+'/sup_linked_fin.json','r') as a:
    ln = pd.read_json(a,orient='table')

In [7]:
# Build, in qr row order, the 'Linked' value for every row of qr: the entry
# from ln when the row's id is present in ln's index, otherwise an empty list.
# The ids of ln are collected into a set once, so each membership test is a
# hash lookup instead of a full scan of ln.index.values per row of qr.
linked_ids = set(ln.index.values)
linked_list = [
    ln.loc[i, 'Linked'] if i in linked_ids else []
    for i in qr.index.values
]

In [8]:
# Store linked_list as the 'Linked' column of qr; the list was built by
# iterating qr.index.values, so it lines up with qr's rows positionally.
qr['Linked'] = linked_list

In [9]:
# Sanity check: list qr's columns, non-null counts and dtypes.
qr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59341 entries, 1 to 1474607
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   AcceptedAnswerId  28701 non-null  float64
 1   AnswerCount       59341 non-null  float64
 2   AnswerIds         59341 non-null  object 
 3   Attachments       13683 non-null  object 
 4   OS                13683 non-null  object 
 5   PostTypeId        59341 non-null  int64  
 6   Related           59341 non-null  object 
 7   Score             59341 non-null  int64  
 8   Tags              59341 non-null  object 
 9   Text              59341 non-null  object 
 10  Title             59341 non-null  object 
 11  Linked            59341 non-null  object 
dtypes: float64(2), int64(2), object(8)
memory usage: 5.9+ MB


In [10]:
# Copy every row of ln whose id is not yet in qr into qr, then replace its
# 'Text' and 'Title' with the preprocessed, space-joined token string.
for i in ln.index.values:
    if i in qr.index.values:
        continue  # id already present in qr; nothing to add
    print(f'Currently pre-processing id {i}')
    qr.loc[i] = ln.loc[i]
    for column in ('Text', 'Title'):
        qr.loc[i, column] = listToString(text_preprocessing(qr.loc[i, column]))

0.0
Currently pre-processing id 201468.0
Currently pre-processing id 962076.0
Currently pre-processing id 1039385.0
Currently pre-processing id 328291.0
Currently pre-processing id 51967.0
Currently pre-processing id 479422.0
Currently pre-processing id 1526545.0
Currently pre-processing id 1473659.0
Currently pre-processing id 1605871.0
Currently pre-processing id 307729.0
Currently pre-processing id 763196.0
Currently pre-processing id 1184092.0
Currently pre-processing id 1052951.0
Currently pre-processing id 631708.0
Currently pre-processing id 671085.0
Currently pre-processing id 113709.0
Currently pre-processing id 470022.0
Currently pre-processing id 368891.0
Currently pre-processing id 1063400.0
Currently pre-processing id 1238372.0
Currently pre-processing id 639334.0
Currently pre-processing id 1354167.0
Currently pre-processing id 1433477.0
Currently pre-processing id 1220144.0
Currently pre-processing id 633060.0
Currently pre-processing id 310681.0
Currently pre-processing

In [13]:
# Create the output directory if needed; exist_ok=True makes this a no-op
# when the directory already exists.
os.makedirs('/home/ckm/visualqatickets/superuser/Data/nosw',exist_ok=True)

In [14]:
# Write qr to qr.json in the output directory, using the same orient='table'
# layout the input files were read with.
qr.to_json('/home/ckm/visualqatickets/superuser/Data/nosw/qr.json',orient='table')