# Textmining

---

Group name: Group C

---


## Setup

In [1]:
# we suppress some unimportant warnings
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import pandas as pd

## Data

In [3]:
#import csv from company article
df = pd.read_csv('/Users/Kim/Library/Mobile Documents/com~apple~CloudDocs/UNI/HdM/Semester7/Data Science Python/DS-Homework1/data/raw/webscraping02.csv')

In [4]:
df

Unnamed: 0.1,Unnamed: 0,Titel,URL,Thema,Autor,Datum,Artikeltext
0,0,Inside Twitter as ‘mass exodus’ of staff...,https://edition.cnn.com/2022/11/17/media/twitt...,New YorkCNN Business —,Oliver Darcy,"Updated 10:35 AM EST, Fri November 18...",New YorkCNN Business — Death i...


## Data correction

In [5]:
df['Artikeltext_lower'] = df['Artikeltext'].astype(str).str.lower()

df.head(3)

Unnamed: 0.1,Unnamed: 0,Titel,URL,Thema,Autor,Datum,Artikeltext,Artikeltext_lower
0,0,Inside Twitter as ‘mass exodus’ of staff...,https://edition.cnn.com/2022/11/17/media/twitt...,New YorkCNN Business —,Oliver Darcy,"Updated 10:35 AM EST, Fri November 18...",New YorkCNN Business — Death i...,new yorkcnn business — death i...


## Text mining data preparation

In [6]:
from nltk.tokenize import RegexpTokenizer

In [7]:
# YOUR CODE HERE
regexp = RegexpTokenizer('\w+') # use regular expression to match (multiple) word characters and numbers

df['Artikeltext_token']=df['Artikeltext_lower'].apply(regexp.tokenize) # insert the data column and the regular expression pattern

df

Unnamed: 0.1,Unnamed: 0,Titel,URL,Thema,Autor,Datum,Artikeltext,Artikeltext_lower,Artikeltext_token
0,0,Inside Twitter as ‘mass exodus’ of staff...,https://edition.cnn.com/2022/11/17/media/twitt...,New YorkCNN Business —,Oliver Darcy,"Updated 10:35 AM EST, Fri November 18...",New YorkCNN Business — Death i...,new yorkcnn business — death i...,"[new, yorkcnn, business, death, is, in, the, a..."


In [8]:
import nltk
from nltk.corpus import stopwords

In [9]:
# Make a list of english stopwords
stopwords = nltk.corpus.stopwords.words("english")

In [10]:
# make your own custom stopwords
my_stopwords = ['https', 'co']

In [11]:
# Extend the stopword list with your own custom stopwords
stopwords.extend(my_stopwords)

In [12]:
df['Artikeltext_token_s'] = df['Artikeltext_token'].apply(lambda x: [i for i in x if i not in stopwords])

df

Unnamed: 0.1,Unnamed: 0,Titel,URL,Thema,Autor,Datum,Artikeltext,Artikeltext_lower,Artikeltext_token,Artikeltext_token_s
0,0,Inside Twitter as ‘mass exodus’ of staff...,https://edition.cnn.com/2022/11/17/media/twitt...,New YorkCNN Business —,Oliver Darcy,"Updated 10:35 AM EST, Fri November 18...",New YorkCNN Business — Death i...,new yorkcnn business — death i...,"[new, yorkcnn, business, death, is, in, the, a...","[new, yorkcnn, business, death, air, twitter, ..."


### Transform data and remove infrequent words

In [13]:
df['Artikeltext_si'] = df['Artikeltext_token_s'].apply(lambda x: ' '.join([i for i in x if len(i)>2]))

df

Unnamed: 0.1,Unnamed: 0,Titel,URL,Thema,Autor,Datum,Artikeltext,Artikeltext_lower,Artikeltext_token,Artikeltext_token_s,Artikeltext_si
0,0,Inside Twitter as ‘mass exodus’ of staff...,https://edition.cnn.com/2022/11/17/media/twitt...,New YorkCNN Business —,Oliver Darcy,"Updated 10:35 AM EST, Fri November 18...",New YorkCNN Business — Death i...,new yorkcnn business — death i...,"[new, yorkcnn, business, death, is, in, the, a...","[new, yorkcnn, business, death, air, twitter, ...",new yorkcnn business death air twitter platfor...


### Lemmatization

In [14]:
from nltk.stem import WordNetLemmatizer

In [15]:
wordnet_lem = WordNetLemmatizer()

In [16]:
# create a new column called text_sil (l for lemmatization) and apply the function .lemmatize
df['Artikeltext_sil'] = df['Artikeltext_si'].apply(wordnet_lem.lemmatize)

# we check wether there are any differences in the two columns
check_difference = (df['Artikeltext_sil'] == df['Artikeltext_sil'])

# sum all True and False values
check_difference.value_counts()

True    1
Name: Artikeltext_sil, dtype: int64

## Sentiment

In [17]:
from nltk.sentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

In [18]:
df['polarity'] = df['Artikeltext_sil'].apply(lambda x: analyzer.polarity_scores(x))

### Transform data

In [19]:
# Change data structure (we unnest the column polarity and add new columns)
df = pd.concat([df.drop(['polarity'], axis=1), df['polarity'].apply(pd.Series)], axis=1)

In [20]:
df['sentiment'] = df['compound'].apply(lambda x: 'positive' if x > 0 else 'neutral' if x == 0 else 'negative')

In [21]:
# Tweet with highest positive sentiment
df[['Artikeltext', 'compound', 'neg', 'neu', 'pos', 'sentiment']].loc[df['compound'].idxmax()]

Artikeltext    New YorkCNN Business        —          Death i...
compound                                                 -0.8847
neg                                                        0.125
neu                                                         0.76
pos                                                        0.115
sentiment                                               negative
Name: 0, dtype: object

In [22]:
# Tweet with highest negative sentiment 
# ...seems to be a case of wrong classification because of the word "deficit"
df[['Artikeltext', 'compound', 'neg', 'neu', 'pos', 'sentiment']].loc[df['compound'].idxmin()]

Artikeltext    New YorkCNN Business        —          Death i...
compound                                                 -0.8847
neg                                                        0.125
neu                                                         0.76
pos                                                        0.115
sentiment                                               negative
Name: 0, dtype: object

## Visualize Data

In [23]:
import altair as alt

# create data to change colors in Altair plot
domain = ['neutral', 'positive', 'negative']
range_=['#b2d8d8',"#008080", '#db3d13']


alt.Chart(df).mark_bar().encode(
    x=alt.X('count()', title=None),
    y=alt.Y('sentiment', sort="-x"),
    color= alt.Color('sentiment', legend=None, scale=alt.Scale(domain=domain, range=range_))
).properties(
    title="Sentiment analysis",
    width=400,
    height=150,
)

In [24]:
# Function to add date variables to DataFrame.
def add_date_info(df):
  df['created_at'] = pd.to_datetime(df['created_at'], unit='ns')
  df['Year'] = pd.DatetimeIndex(df['created_at']).year
  df['Month'] = pd.DatetimeIndex(df['created_at']).month
  df['Day'] = pd.DatetimeIndex(df['created_at']).day
  df['DOY'] = pd.DatetimeIndex(df['created_at']).dayofyear
  df['Date'] = pd.DatetimeIndex(df['created_at']).date
  return df

In [25]:
alt.Chart(df).mark_area().encode(
   x=alt.X('Date', axis=alt.Axis(format='%e.%-m.')),
   y=alt.Y('count(sentiment)'),
   color=alt.Color('sentiment', scale=alt.Scale(domain=domain, range=range_))
)

ValueError: Date encoding field is specified without a type; the type cannot be inferred because it does not match any column in the data.

alt.Chart(...)

In [None]:
alt.Chart(df).mark_boxplot().encode(
    x=alt.X('sentiment'),
    y=alt.Y('compound'),
    color=alt.Color('sentiment', scale=alt.Scale(domain=domain, range=range_))
).properties(
    width=200,
    height=200
)

In [None]:
df.to_csv("/Users/Kim/Library/Mobile Documents/com~apple~CloudDocs/UNI/HdM/Semester7/Data Science Python/DS-Homework1/data/processed/textmining-clean-cnn.csv", index=None)