In [48]:
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re
import string

In [25]:
gpt_posts = pd.read_csv("../data/raw/so_tag_gpt.csv")
chat_gpt_posts = pd.read_csv("../data/raw/so_tag_chatgpt.csv")

In [26]:
chat_gpt_posts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 259 entries, 0 to 258
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Post Link         259 non-null    int64  
 1   PostTypeId        259 non-null    int64  
 2   OwnerUserId       259 non-null    int64  
 3   Answer Link       43 non-null     float64
 4   Title             259 non-null    object 
 5   Body              259 non-null    object 
 6   CreationDate      259 non-null    object 
 7   ClosedDate        13 non-null     object 
 8   LastEditDate      133 non-null    object 
 9   LastActivityDate  259 non-null    object 
 10  Tags              259 non-null    object 
 11  AnswerCount       259 non-null    int64  
 12  CommentCount      259 non-null    int64  
 13  Score             259 non-null    int64  
 14  ViewCount         259 non-null    int64  
 15  FavoriteCount     0 non-null      float64
 16  PostTypeId.1      43 non-null     float64
 1

In [27]:
so_posts = pd.concat([gpt_posts, chat_gpt_posts], ignore_index=False)
df = so_posts.drop_duplicates(subset='Post Link', keep='first')

In [28]:
df.reset_index(drop=True, inplace=True)

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 687 entries, 0 to 686
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Post Link         687 non-null    int64  
 1   PostTypeId        687 non-null    int64  
 2   OwnerUserId       683 non-null    float64
 3   Answer Link       149 non-null    float64
 4   Title             687 non-null    object 
 5   Body              687 non-null    object 
 6   CreationDate      687 non-null    object 
 7   ClosedDate        18 non-null     object 
 8   LastEditDate      368 non-null    object 
 9   LastActivityDate  687 non-null    object 
 10  Tags              687 non-null    object 
 11  AnswerCount       687 non-null    int64  
 12  CommentCount      687 non-null    int64  
 13  Score             687 non-null    int64  
 14  ViewCount         687 non-null    int64  
 15  FavoriteCount     27 non-null     float64
 16  PostTypeId.1      149 non-null    float64
 1

In [17]:
df.to_csv('../data/raw/combined_data.csv', index=False)

In [30]:
df.loc[:, 'Body'] = df['Body'].astype(str).copy()

In [31]:
# Removing Code segments
for index_label, row_series in df.iterrows():
    soup = BeautifulSoup(df.at[index_label, 'Body'])
    for tag in soup.find_all(['pre', 'blockquote', 'code']):
        tag.replaceWith('')
    df.at[index_label , 'Body'] = soup.get_text()

In [32]:
df['Body']

0      I am trying to get the response from my gpt ap...
1      I am trying to fine-tune the GPT model, and fo...
2      I am trying to integrate the openAi API model ...
3      I've been trying to upload a json file that I ...
4      I am finetuning gpt2 on text classification wi...
                             ...                        
682    I want to create a self hosted LLM model that ...
683    How do i add memory to RetrievalQA.from_chain_...
684    I'm trying to use ChatGPT for my Telegram bot....
685    When I use  parameter on https://api.openai.co...
686    I have fine-tuned an  language model () and wa...
Name: Body, Length: 687, dtype: object

In [33]:
# Clean Text
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\'\n", " ", text)
    text = re.sub(r"\r", " ", text)
    text = re.sub(r"<td>", " ", text)
    text = re.sub(r"</td>", " ", text)
    text = re.sub(r"<tr>", " ", text)
    text = re.sub(r"</tr>", " ", text)
    text = re.sub(r"\'\xa0", " ", text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

df.loc[:, 'Body'] = df['Body'].apply(lambda x: clean_text(x))

In [34]:
# Removing URLs
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)
df.loc[:, 'Body'] = df['Body'].apply(lambda x: remove_url(x))

In [35]:
df['Body']

0      i am trying to get the response from my gpt ap...
1      i am trying to fine-tune the gpt model, and fo...
2      i am trying to integrate the openai api model ...
3      i have been trying to upload a json file that ...
4      i am finetuning gpt2 on text classification wi...
                             ...                        
682    i want to create a self hosted llm model that ...
683    how do i add memory to retrievalqa.from_chain_...
684    i am trying to use chatgpt for my telegram bot...
685    when i use parameter on  the memory is not per...
686    i have fine-tuned an language model () and was...
Name: Body, Length: 687, dtype: object

In [36]:
# Removing HTML Tags
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)
df.loc[:, 'Body'] = df['Body'].apply(lambda x: remove_html(x))

In [37]:
df['Body']

0      i am trying to get the response from my gpt ap...
1      i am trying to fine-tune the gpt model, and fo...
2      i am trying to integrate the openai api model ...
3      i have been trying to upload a json file that ...
4      i am finetuning gpt2 on text classification wi...
                             ...                        
682    i want to create a self hosted llm model that ...
683    how do i add memory to retrievalqa.from_chain_...
684    i am trying to use chatgpt for my telegram bot...
685    when i use parameter on  the memory is not per...
686    i have fine-tuned an language model () and was...
Name: Body, Length: 687, dtype: object

In [38]:
df.Body

0      i am trying to get the response from my gpt ap...
1      i am trying to fine-tune the gpt model, and fo...
2      i am trying to integrate the openai api model ...
3      i have been trying to upload a json file that ...
4      i am finetuning gpt2 on text classification wi...
                             ...                        
682    i want to create a self hosted llm model that ...
683    how do i add memory to retrievalqa.from_chain_...
684    i am trying to use chatgpt for my telegram bot...
685    when i use parameter on  the memory is not per...
686    i have fine-tuned an language model () and was...
Name: Body, Length: 687, dtype: object

In [39]:
# Removing Emojis
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags 
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

df.loc[:, 'Body'] = df['Body'].apply(lambda x: remove_emoji(x))

In [40]:
# Removing Punctuations
from nltk.tokenize import ToktokTokenizer
token=ToktokTokenizer()
punct = '!"#$%&\'()*+,./:;<=>?@[\\]^`{|}~-'
def strip_list_noempty(mylist):
    newlist = (item.strip() if hasattr(item, 'strip') else item for item in mylist)
    return [item for item in newlist if item != '']
def clean_punct(text): 
    words=token.tokenize(text)
    punctuation_filtered = []
    regex = re.compile('[%s]' % re.escape(punct))
    for w in words:
        if w in tags_features:
            punctuation_filtered.append(w)
        else:
            punctuation_filtered.append(regex.sub('', w))
  
    filtered_list = strip_list_noempty(punctuation_filtered)
        
    return ' '.join(map(str, filtered_list))
tags_features = pd.read_csv("../data/cleaned/freq_words.csv", usecols = ['Tag'])
df.loc[:, 'Body'] = df['Body'].apply(lambda x: clean_punct(x))

In [41]:
df['Body']

0      i am trying to get the response from my gpt ap...
1      i am trying to finetune the gpt model and for ...
2      i am trying to integrate the openai api model ...
3      i have been trying to upload a json file that ...
4      i am finetuning gpt2 on text classification wi...
                             ...                        
682    i want to create a self hosted llm model that ...
683    how do i add memory to retrievalqafrom_chain_t...
684    i am trying to use chatgpt for my telegram bot...
685    when i use parameter on the memory is not pers...
686    i have finetuned an language model and was abl...
Name: Body, Length: 687, dtype: object

In [49]:
# Removing Stop Words
from nltk.corpus import stopwords
#stop = stopwords.words('english')
stop = ['after', 'afterwards','again','against', 'all', 'almost','alone','along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides',
           'between',
           'beyond',
           'bill',
           'both',
           'bottom',
           'but',
           'by',
           'call',
           'can',
           'cannot',
           'cant',
           'co',
           'con',
           'could',
           'couldnt',
           'cry',
           'de',
           'describe',
           'detail',
           'do',
           'done',
           'down',
           'due',
           'during',
           'each',
           'eg',
           'eight',
           'either',
           'eleven',
           'else',
           'elsewhere',
           'empty',
           'enough',
           'etc',
           'even',
           'ever',
           'every',
           'everyone',
           'everything',
           'everywhere',
           'except',
           'few',
           'fifteen',
           'fifty',
           'fill',
           'find',
           'fire',
           'first',
           'five',
           'for',
           'former',
           'formerly',
           'forty',
           'found',
           'four',
           'from',
           'front',
           'full',
           'further',
           'get',
           'give',
           'go',
           'had',
           'has',
           'have',
           'he',
           'hence',
           'her',
           'here',
           'hereafter',
           'hereby',
           'herein',
           'hereupon',
           'hers',
           'herself',
           'him',
           'himself',
           'his',
           'how',
           'however',
           'hundred',
           'i',
           'ie',
           'if',
           'in',
           'inc',
           'indeed',
           'interest',
           'into',
           'is',
           'it',
           'its',
           'itself',
           'keep',
           'last',
           'latter',
           'latterly',
           'ltd',
           'made',
           'many',
           'may',
           'me',
           'meanwhile',
           'might',
           'mill',
           'mine',
           'more',
           'moreover',
           'most',
           'mostly',
           'move',
           'much',
           'must',
           'my',
           'myself',
           'name',
           'namely',
           'next',
           'nine',
           'of',
           'off',
           'often',
           'on',
           'once',
           'one',
           'only',
           'onto',
           'or',
           'other',
           'others',
           'otherwise',
           'our',
           'ours',
           'ourselves',
           'out',
           'over',
           'own',
           'part',
           'per',
           'perhaps',
           'please',
           'put',
           'rather',
           're',
           'same',
           'see',
           'seem',
           'seemed',
           'seeming',
           'seems',
           'serious',
           'several',
           'she',
           'should',
           'show',
           'side',
           'since',
           'sincere',
           'six',
           'sixty',
           'so',
           'some',
           'somehow',
           'someone',
           'something',
           'sometime',
           'sometimes',
           'somewhere',
           'still',
           'such',
           'system',
           'take',
           'ten',
           'than',
           'that',
           'the',
           'their',
           'them',
           'themselves',
           'then',
           'thence',
           'there',
           'thereafter',
           'thereby',
           'therefore',
           'therein',
           'thereupon',
           'these',
           'they',
           'thick',
           'thin',
           'third',
           'this',
           'those',
           'though',
           'three',
           'through',
           'throughout',
           'thru',
           'thus',
           'to',
           'together',
           'too',
           'top',
           'toward',
           'towards',
           'twelve',
           'twenty',
           'two',
           'un',
           'under',
           'until',
           'up',
           'upon',
           'us',
           'very',
           'via',
           'was',
           'we',
           'well',
           'were',
           'what',
           'whatever',
           'when',
           'whence',
           'whenever',
           'where',
           'whereafter',
           'whereas',
           'whereby',
           'wherein',
           'whereupon',
           'wherever',
           'whether',
           'which',
           'while',
           'whither',
           'who',
           'whoever',
           'whole',
           'whom',
           'whose',
           'why',
           'will',
           'with',
           'within',
           'would',
           'yet',
           'you',
           'your',
           'yours',
           'yourself',
           'yourselves']

df.loc[:, 'Body'] = df['Body'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [43]:
# Lemmatizing
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import ToktokTokenizer
token=ToktokTokenizer()
lemma=WordNetLemmatizer()
nltk.download('wordnet')

def lemitizeWords(text):
    words=token.tokenize(text)
    listLemma=[]
    for w in words:
        x=lemma.lemmatize(w, pos="v")
        listLemma.append(x)
    return ' '.join(map(str, listLemma))

df.loc[:, 'Body'] = df['Body'].apply(lambda x: lemitizeWords(x)) 

[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [44]:
df['Body']

0      try response gpt api word word like chatgpt ge...
1      try finetune gpt model 3 columns context quest...
2      try integrate openai api model terminal enable...
3      try upload a json file use fine tune gpt3 mode...
4      finetuning gpt2 text classification huggingfac...
                             ...                        
682    want create a self host llm model able a conte...
683    add memory retrievalqafrom_chain_type add a cu...
684    try use chatgpt telegram bot use use textdavin...
685    use parameter memory not persist across multip...
686    finetuned language model able access model met...
Name: Body, Length: 687, dtype: object

In [45]:
# Removing numbers
number_pattern = r'[0-9]'
df.loc[:, 'Body'] = df['Body'].apply(lambda x: re.sub(number_pattern, '', x)) 

In [46]:
df['Body']

0      try response gpt api word word like chatgpt ge...
1      try finetune gpt model  columns context questi...
2      try integrate openai api model terminal enable...
3      try upload a json file use fine tune gpt model...
4      finetuning gpt text classification huggingface...
                             ...                        
682    want create a self host llm model able a conte...
683    add memory retrievalqafrom_chain_type add a cu...
684    try use chatgpt telegram bot use use textdavin...
685    use parameter memory not persist across multip...
686    finetuned language model able access model met...
Name: Body, Length: 687, dtype: object

In [47]:
df.to_csv('../data/cleaned/cleaned_data_second.csv', index=False)