In [None]:
# Install the third-party packages that the import cell below needs:
# unidecode, word2number (imported as w2n) and gensim.
!pip install unidecode
!pip install word2number
!pip install gensim

In [None]:
! sudo apt install openjdk-8-jdk
! sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
! pip install language-check
! pip install pycontractions

In [None]:
from bs4 import BeautifulSoup
import spacy
import unidecode
from word2number import w2n
from pycontractions import Contractions
import gensim.downloader as api
import en_core_web_sm
import re

In [None]:
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive; the data files read and written by the
# later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# spaCy English pipeline; text_preprocessing uses it to tokenise text and to read
# each token's POS tag, stop-word flag and lemma.
nlp = en_core_web_sm.load()

# Choose model accordingly for contractions function
# (loaded through gensim's downloader; the commented-out lines are alternatives)
model = api.load("glove-twitter-25")
# model = api.load("glove-twitter-100")
# model = api.load("word2vec-google-news-300")

# Contraction expander built on the vectors loaded above. It is created once at
# module level and used by expand_contractions through the global `cont`.
cont = Contractions(kv_model=model)
cont.load_models()

# exclude words from spacy stopwords list
# ('no' and 'not' are un-flagged so stop-word removal does not drop them)
deselect_stop_words = ['no', 'not']
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False


def strip_html_tags(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's "html.parser" and the extracted
    text fragments are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")


def remove_whitespace(text):
    """Trim ``text`` and collapse every run of whitespace into one space."""
    words = text.strip().split()
    return " ".join(words)


def remove_accented_chars(text):
    """Pass ``text`` through unidecode to get rid of accented characters (e.g. café)."""
    return unidecode.unidecode(text)


def expand_contractions(text):
    """Expand shortened forms in ``text``, e.g. "don't" becomes "do not".

    The module-level ``cont`` expander is called in precise mode on a
    one-element batch, and the first (only) result is returned.
    """
    expanded_batch = list(cont.expand_texts([text], precise=True))
    return expanded_batch[0]


def text_preprocessing(text, accented_chars=True, contractions=True,
                       convert_num=True, extra_whitespace=True,
                       lemmatization=True, lowercase=True, punctuations=False,
                       remove_html=True, remove_num=False, special_chars=False,
                       stop_words=True):
    """Clean ``text`` and return the surviving tokens as a list.

    String-level steps run first, in this order: HTML stripping, whitespace
    normalisation, accent removal, contraction expansion, lower-casing. The
    result is tokenised with the module-level spaCy pipeline ``nlp`` and every
    token is then filtered and/or rewritten according to the flags below.

    Note that not every step is on by default: ``punctuations``,
    ``remove_num`` and ``special_chars`` default to False.

    Args:
        text: the raw string to preprocess.
        accented_chars: run ``remove_accented_chars`` on the text.
        contractions: run ``expand_contractions`` on the text.
        convert_num: replace tokens tagged 'NUM' with the value returned by
            ``w2n.word_to_num``; tokens that cannot be converted are kept as
            their original text and a message is printed.
        extra_whitespace: run ``remove_whitespace`` on the text.
        lemmatization: replace tokens by their lemma (skipped for tokens
            handled by ``convert_num`` and for the "-PRON-" lemma).
        lowercase: lower-case the text before tokenisation.
        punctuations: drop tokens tagged 'PUNCT'.
        remove_html: run ``strip_html_tags`` on the text.
        remove_num: drop tokens tagged 'NUM' or whose text is numeric.
        special_chars: drop tokens tagged 'SYM'.
        stop_words: drop stop-word tokens, except those tagged 'NUM'.

    Returns:
        list: the kept tokens in document order. Items are strings, except
        for number words converted by ``convert_num``, which are whatever
        ``w2n.word_to_num`` returned.
    """
    # --- string-level clean-up, applied before tokenisation ---
    if remove_html:
        text = strip_html_tags(text)
    if extra_whitespace:
        text = remove_whitespace(text)
    if accented_chars:
        text = remove_accented_chars(text)
    if contractions:
        text = expand_contractions(text)
    if lowercase:
        text = text.lower()

    doc = nlp(text)  # tokenise text

    clean_text = []
    for token in doc:
        pos = token.pos_

        # --- filters: the first matching rule drops the token ---
        # Stop words tagged as numbers are kept so they can still be
        # converted (or removed by remove_num) below.
        if stop_words and token.is_stop and pos != 'NUM':
            continue
        if punctuations and pos == 'PUNCT':
            continue
        if special_chars and pos == 'SYM':
            continue
        if remove_num and (pos == 'NUM' or token.text.isnumeric()):
            continue

        # --- rewrites: number conversion takes precedence over lemmatisation ---
        edit = token.text
        if convert_num and pos == 'NUM':
            try:
                edit = w2n.word_to_num(token.text)
            except Exception:
                # Best effort: keep the original token text when it is not a
                # number word. `Exception` (not a bare except) so that
                # KeyboardInterrupt / SystemExit still stop a long-running loop.
                print('error in converting number words to numeric numbers',token.text)
        # "-PRON-" is a placeholder lemma (presumably spaCy's pronoun lemma -
        # confirm for the installed version); keep the surface form instead.
        elif lemmatization and token.lemma_ != "-PRON-":
            edit = token.lemma_

        # Compare against "" explicitly: a converted number 0 must be kept.
        if edit != "":
            clean_text.append(edit)
    return clean_text



In [None]:
def listToString(s):
    """Concatenate the items of ``s`` into one string, each followed by a space.

    Every item is converted with ``str``. The result therefore ends with a
    trailing space, and an empty iterable gives the empty string.
    """
    return "".join(str(item) + ' ' for item in s)

In [None]:
# Read sup_linked_fin.json (pandas "table" orientation) from Drive into the
# DataFrame `ln`, which the following cells preprocess and save.
with open('/content/drive/MyDrive/Data/superuser/sup_linked_fin.json','r') as a:
    ln = pd.read_json(a,orient='table')

In [None]:
# Replace the 'Text' and 'Title' columns of `ln` with their preprocessed,
# space-joined token strings. Mapping over each column avoids one scalar .loc
# read and write per cell and does not depend on the index labels being unique.
# A column is only overwritten once all of its rows have been processed, so an
# interrupt cannot leave it half raw and half cleaned.
for column in ('Text', 'Title'):
    ln[column] = ln[column].map(
        lambda raw: listToString(text_preprocessing(raw, stop_words=True))
    )

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
error in converting number words to numeric numbers 4.8
error in converting number words to numeric numbers 1.42.9
error in converting number words to numeric numbers /dev
error in converting number words to numeric numbers /dev
error in converting number words to numeric numbers /dev
error in converting number words to numeric numbers size=512
error in converting number words to numeric numbers 2.0
error in converting number words to numeric numbers 2.0
error in converting number words to numeric numbers head=255
error in converting number words to numeric numbers head=240
error in converting number words to numeric numbers head=255
error in converting number words to numeric numbers head=255
error in converting number words to numeric numbers head=240
error in converting number words to numeric numbers head=255
error in converting number words to numeric numbers 0.5
error in converting number words to numeric numbers 32

KeyboardInterrupt: ignored

In [None]:
# Write `ln` to a separate file (sup_linked_fin_nosw.json) on Drive, using the
# same "table" orientation the frame was loaded with.
with open('/content/drive/MyDrive/Data/superuser/sup_linked_fin_nosw.json','w') as a:
    ln.to_json(a,orient='table')

In [None]:
# Preprocess Data_Missing.txt segment by segment and append one output line per
# segment to two files: one keeping stop words, one with stop words removed.
with open('/content/drive/MyDrive/Data/Data_Missing.txt','r',encoding='utf-8') as fi:
    s = fi.read()

# Raw string so `\?` reaches the regex engine as an escaped question mark
# instead of being an invalid Python string escape. Segments are delimited by
# '?' or a newline; the delimiters themselves are dropped.
s_split = re.split(r'\?|\n',s)
final_with_stopwords = ''
final_without_stopwords = ''
# Both outputs are opened once instead of once per segment. They stay in append
# mode, so existing content is kept and re-running this cell appends the lines
# again.
with open('/content/drive/MyDrive/Data/data_missing_with_stopwords.txt','a',encoding='utf-8') as out_with, \
     open('/content/drive/MyDrive/Data/data_missing_without_stopwords.txt','a',encoding='utf-8') as out_without:
    for segment in s_split:
        final_with_stopwords = listToString(text_preprocessing(segment,stop_words=False)) + '\n'
        final_without_stopwords = listToString(text_preprocessing(segment,stop_words=True)) + '\n'
        out_with.write(final_with_stopwords)
        out_without.write(final_without_stopwords)
# Preview only: these variables hold the output of the last segment processed.
print(final_with_stopwords[:100])
print(final_without_stopwords[:100])