# Imports

In [1]:
import json
import pandas as pd
import matplotlib as mp
# for language detection
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from pathlib import Path
import trafilatura

# Functions and classes

In [2]:
# Load the raw JSONL corpus: one JSON object per line.
filepath = "../../data/raw/mainpipe_data_v1.jsonl"
data = []
skipped_lines = 0  # number of lines that failed to parse as JSON

with open(filepath, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError:
            # Best-effort load: drop the malformed line, but keep count so the
            # data loss is visible instead of silent.
            skipped_lines += 1

# Only report when something was actually dropped, so a clean load prints
# nothing extra.
if skipped_lines:
    print(f"Skipped {skipped_lines} line(s) that were not valid JSON")

df = pd.DataFrame(data)
print(df)

                                                     text  \
0       In the never ending battle to rid Alaska of it...   
1       » Jackpot | Deutsche Online Casinos und Casino...   
2       This really was an unexpected pleasure. When I...   
3       def files(self):\n        """Files in torrent....   
4       Patient engagement in the design and delivery ...   
...                                                   ...   
269373  Our 1 to 1 Karting lessons are ideal to give y...   
269374  function read(model) {\n  var query = argument...   
269375  In a land that is already fragile with earthqu...   
269376                                  Simple, YES on 8!   
269377  <p>How would I be able to get N results for se...   

                                                      url  
0                                                    None  
1         http://www.casinodeutsch.net/stichwort/jackpot/  
2       http://leekat.booklikes.com/post/608842/an-une...  
3       https://github.com/

In [7]:
def detect_language(text):
    """
    Return the language code that langdetect assigns to ``text``.

    Returns the string "Unknown" when no detection is possible: ``text`` is
    not a string, is empty or whitespace-only, or langdetect raises
    LangDetectException.
    """
    # Non-string values (e.g. None/NaN from a missing cell) and blank strings
    # are screened out here rather than being handed to detect(); the except
    # clause below only covers LangDetectException.
    if not isinstance(text, str) or not text.strip():
        return "Unknown"
    try:
        return detect(text)
    except LangDetectException:
        return "Unknown"
    
def clean_html_trafilatura(text):
    """
    Clean HTML elements (tags, JS) out of ``text`` with the trafilatura library.

    Returns whatever trafilatura.extract produces; when that result is falsy
    (None or an empty string) the input is returned unchanged.
    """
    # TODO (MA): check how trafilatura.extract behaves on a regular, non-HTML string
    cleaned = trafilatura.extract(text)
    if cleaned:
        return cleaned
    return text

class utf8cleaning:
    """
    Cleaning step that standardises the 'text' column to UTF-8 across the df.
    """

    def run(self, df):
        """
        Return a new frame whose 'text' values have been cast to ``str`` and
        round-tripped through UTF-8, dropping anything that cannot be encoded.

        Missing values (None/NaN) are passed through untouched so a later
        null-removal step can still find them. The input frame is not modified.
        """
        def _utf8_round_trip(value):
            # 'ignore' drops characters that cannot be encoded (e.g. lone
            # surrogates) instead of raising UnicodeEncodeError.
            return str(value).encode('utf-8', 'ignore').decode()

        # na_action='ignore' keeps missing cells missing rather than turning
        # them into the literal strings "None" / "nan".
        cleaned = df['text'].map(_utf8_round_trip, na_action='ignore')

        # assign() builds a new frame, so the caller's frame (which may be a
        # filtered slice of another frame) is never written to.
        return df.assign(text=cleaned)

class nullcleaning:
    """
    Cleaning step that drops rows whose 'text' value is missing.
    """

    def run(self, df):
        # Boolean mask: True where 'text' holds a non-null value.
        has_text = df['text'].notna()
        return df[has_text]

class lanugagecleaning:
    """
    Cleaning step that tags each row with a detected language and drops the
    rows whose language could not be determined.
    """

    def run(self, df):
        """
        Return a new frame with a 'language' column (detect_language applied to
        each 'text' value), keeping only rows whose language is not 'Unknown'.

        The input frame is not modified.
        """
        # assign() adds the column on a new frame; writing df['language']
        # directly would modify the caller's frame, which may itself be a
        # filtered slice of another frame.
        tagged = df.assign(language=df['text'].apply(detect_language))

        # keep only rows with a detected language
        # (the removed rows are discarded, not stored anywhere)
        return tagged[tagged['language'] != 'Unknown']
    

# Remove any NA's in text

In [8]:
# Keep only rows that have text. The explicit copy makes df an independent
# frame, so the column assignments in later cells do not write into a filtered
# slice of the loaded data (which pandas flags with SettingWithCopyWarning).
df = df[df['text'].notna()].copy()

# UTF-8 encoding

In [4]:
# Cast every value to str and round-trip it through UTF-8, ignoring anything
# that cannot be encoded.
df['text'] = df['text'].map(lambda value: str(value).encode('utf-8', 'ignore').decode())

# HTML Cleaning

In [5]:
# The text data contains many HTML elements, including tags and JavaScript;
# run every value through the trafilatura-based cleaner.
df["text"] = df["text"].map(clean_html_trafilatura)

In [6]:
# Preview the first 30 rows of the frame.
df.iloc[:30]

Unnamed: 0,text,url
0,In the never ending battle to rid Alaska of it...,
1,» Jackpot | Deutsche Online Casinos und Casino...,http://www.casinodeutsch.net/stichwort/jackpot/
2,This really was an unexpected pleasure. When I...,http://leekat.booklikes.com/post/608842/an-une...
3,"def files(self):\n """"""Files in torrent....",https://github.com/idlesign/torrentool/blob/78...
4,Patient engagement in the design and delivery ...,http://www.nhlc-cnls.ca/sessions/3/
5,function fallBack () {\n var programFiles...,https://github.com/mozilla-jetpack/node-fx-run...
6,"Whether you’re looking for a new Nissan Rogue,...",https://www.puyallupnissan.com/value-your-trade/
7,Ubuntu 9.10\nSilly question time: When the cla...,
8,"Actually, if during the commission of a felony...",
9,Bernadine brings to the agency over twenty yea...,https://www.4rfv.com/V7BTWGJ6BUYI/AA/bernadine...


# Filtering code (specifically github)

In [None]:
# isolate code from github: rows whose source URL contains "github.com"
# na=False treats rows without a URL as non-matches; regex=False matches the
# text literally (so the "." is not a regex wildcard).
is_github = df["url"].str.contains("github.com", na=False, regex=False)
df[is_github]

# Text normalisation

In [None]:
# remove URLs
# remove unusual characters
# remove stop words and irrelevant content

# Languages

In [9]:
# Look at the languages in the text data using langdetect; values that cannot
# be classified are labelled "Unknown".
# NOTE(review): this re-declares the detect_language helper already defined in
# an earlier cell of this notebook; the later definition is the one in effect.
def detect_language(text):
    """
    Return the language code that langdetect assigns to ``text``.

    Returns the string "Unknown" when no detection is possible: ``text`` is
    not a string, is empty or whitespace-only, or langdetect raises
    LangDetectException.
    """
    # Non-string values (e.g. None/NaN from a missing cell) and blank strings
    # are screened out here rather than being handed to detect(); the except
    # clause below only covers LangDetectException.
    if not isinstance(text, str) or not text.strip():
        return "Unknown"
    try:
        return detect(text)
    except LangDetectException:
        return "Unknown"

In [10]:

# langdetect's algorithm is non-deterministic unless seeded; fix the seed
# before the first detection so re-running the notebook gives the same labels.
DetectorFactory.seed = 0

# assign language to each row
# NOTE(review): this is a per-row Python call over the whole frame, and the
# recorded output shows the cell being interrupted; consider running it on a
# sample or caching the result.
df['language'] = df['text'].apply(detect_language)

KeyboardInterrupt: 

In [10]:
# Display the frame to inspect the 'language' column added above.
df

Unnamed: 0,text,url,language
0,In the never ending battle to rid Alaska of it...,,en
1,» Jackpot | Deutsche Online Casinos und Casino...,http://www.casinodeutsch.net/stichwort/jackpot/,de
2,This really was an unexpected pleasure. When I...,http://leekat.booklikes.com/post/608842/an-une...,en
3,"def files(self):\n """"""Files in torrent....",https://github.com/idlesign/torrentool/blob/78...,en
4,Patient engagement in the design and delivery ...,http://www.nhlc-cnls.ca/sessions/3/,en
...,...,...,...
269373,Our 1 to 1 Karting lessons are ideal to give y...,https://midlandkarting.co.uk/go-karting-events...,en
269374,function read(model) {\n var query = argument...,https://github.com/endpoints/endpoints/blob/1e...,en
269375,In a land that is already fragile with earthqu...,,en
269376,"Simple, YES on 8!",,en


In [12]:
# drop the rows whose language could not be detected
df = df.loc[df['language'].ne('Unknown')]

In [6]:
# NOTE(review): the output recorded for this cell has no 'language' column and
# the cell carries an earlier execution count than the cells before it, so the
# output appears stale; re-run the notebook top to bottom to refresh it.
df

Unnamed: 0,text,url
0,In the never ending battle to rid Alaska of it...,
1,» Jackpot | Deutsche Online Casinos und Casino...,http://www.casinodeutsch.net/stichwort/jackpot/
2,This really was an unexpected pleasure. When I...,http://leekat.booklikes.com/post/608842/an-une...
3,"def files(self):\n """"""Files in torrent....",https://github.com/idlesign/torrentool/blob/78...
4,Patient engagement in the design and delivery ...,http://www.nhlc-cnls.ca/sessions/3/
...,...,...
269373,Our 1 to 1 Karting lessons are ideal to give y...,https://midlandkarting.co.uk/go-karting-events...
269374,function read(model) {\n var query = argument...,https://github.com/endpoints/endpoints/blob/1e...
269375,In a land that is already fragile with earthqu...,
269376,"Simple, YES on 8!",


# Filtering based on text lengths