# Textmining 

## Setup

In [30]:
import pandas as pd
import altair as alt
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np

import warnings

In [31]:
# ignore warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Data
### Data Import

In [32]:
df = pd.read_csv("https://raw.githubusercontent.com/louisawalter/DS-Homework1/main/data/raw/webscraping.csv",index_col=0)

In [33]:
df

Unnamed: 0,Titel,URL,Thema,Autor,Datum,Artikeltext,Tags
0,The Midterms Made State Governments Bluer,https://fivethirtyeight.com/features/2022-gove...,2022 Election,Nathaniel Rakich,"Nov. 17, 2022, at 6:00 AM","Abortion bans, right-to-work laws, voting rest...","2022 Election, 2022 Midterms, 2022 Governors E..."


In [34]:
df["Artikeltext_lower"] = df["Artikeltext"].astype(str).str.lower()

In [35]:
df

Unnamed: 0,Titel,URL,Thema,Autor,Datum,Artikeltext,Tags,Artikeltext_lower
0,The Midterms Made State Governments Bluer,https://fivethirtyeight.com/features/2022-gove...,2022 Election,Nathaniel Rakich,"Nov. 17, 2022, at 6:00 AM","Abortion bans, right-to-work laws, voting rest...","2022 Election, 2022 Midterms, 2022 Governors E...","abortion bans, right-to-work laws, voting rest..."


In [36]:
regexp = RegexpTokenizer('\w+') # use regular expression to match (multiple) word characters and numbers

df['Artikeltext_token']=df['Artikeltext_lower'].apply(regexp.tokenize) # insert the data column and the regular expression pattern

In [37]:

# Make a list of english stopwords
stopwords = nltk.corpus.stopwords.words("english")

In [38]:
my_stopwords = ['https', 'co']
stopwords.extend(my_stopwords)

In [39]:
df['Artikeltext_token_s'] = df['Artikeltext_token'].apply(lambda x: [i for i in x if i not in stopwords])

In [40]:
df

Unnamed: 0,Titel,URL,Thema,Autor,Datum,Artikeltext,Tags,Artikeltext_lower,Artikeltext_token,Artikeltext_token_s
0,The Midterms Made State Governments Bluer,https://fivethirtyeight.com/features/2022-gove...,2022 Election,Nathaniel Rakich,"Nov. 17, 2022, at 6:00 AM","Abortion bans, right-to-work laws, voting rest...","2022 Election, 2022 Midterms, 2022 Governors E...","abortion bans, right-to-work laws, voting rest...","[abortion, bans, right, to, work, laws, voting...","[abortion, bans, right, work, laws, voting, re..."


### Transform data and remove infrequent words

In [41]:
df['Artikeltext_si'] = df['Artikeltext_token_s'].apply(lambda x: ' '.join([i for i in x if len(i)>2]))

### Lemmatization

In [42]:
# create an object called wordnet_lem of the WordNetLemmatizer() function.
wordnet_lem = WordNetLemmatizer()

In [43]:
# create a new column called text_sil (l for lemmatization) and apply the function .lemmatize
df['Artikeltext_sil'] = df['Artikeltext_si'].apply(wordnet_lem.lemmatize)

# we check wether there are any differences in the two columns
check_difference = (df['Artikeltext_sil'] == df['Artikeltext_sil'])

# sum all True and False values
check_difference.value_counts()

True    1
Name: Artikeltext_sil, dtype: int64