# Exploratory Analysis Notebook

Python scripts for extracting information from the created features. 

Table of Contents:

### 1. Import required packages


In [70]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import plotly.express as px


### 2. Import files

In [5]:
# Location of the featured ABT pickle. Set the ABT_COVID_PATH environment variable to
# point at another copy of the file; the path used so far remains the default so the
# notebook keeps working unchanged on the original machine.
ABT_COVID_PATH = os.environ.get(
    "ABT_COVID_PATH",
    'C:/Users/molna/Desktop/Szakdolgozat/adatok/abt_covid_featured.pkl',
)

# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
abt_covid = pd.read_pickle(ABT_COVID_PATH)

# Report the table dimensions as a quick sanity check of the load.
print("ABT table has {} rows and {} columns".format(len(abt_covid), len(abt_covid.columns)))

ABT table has 63633 rows and 50 columns


In [6]:
# Preview the first two rows of the loaded table.
abt_covid.head(n=2)

Unnamed: 0,title,date,text,source,dezinf,title_word_cnt,title_avg_word,title_exclam_num,title_ques_num,title_stop_cnt,...,title_ner_pers,title_ner_orgs,title_ner_locs,text_ner_pers,text_ner_orgs,text_ner_locs,title_senti_list,title_polarity,text_senti_list,text_polarity
"""56""",4000 szexstreamer adatai kerültek nyilvánosságra,2020-01-21,A vpnMentor biztonságtechnikai kutatói 875 ez...,index.hu,0,5,8.8,0,0,0,...,[],[],[],[],"[Softpedia News., ImLive, Fotó: Shutterstock Á...","[Amazon-szerveren, ImLive-nak]","[0, 0, 0, 0]",0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",0.0
"""70""",Ukrán állampolgárok adatai szivárogtak ki egy ...,2020-01-21,Újabb kiberbiztonsági probléma Ukrajnában: a ...,index.hu,0,8,7.125,0,0,2,...,[],[],[],"[Zelenszkij, Akták Kiberbiztonság]","[Reuters., FBI, Burisma, Nemzetbiztonsági Szak...","[Ukrajnában, Ukrajnából, Kijev]","[0, 0, 0, 0, 0, 0]",0.0,"[0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...",0.0


### 3. Exploratory Analysis

### 3.1 Count of collected Covid articles by source

In [24]:
# Count the non-null article texts per source and order the sources
# from the fewest to the most documents (sort_values defaults to ascending).
df_count_source = (
    abt_covid
    .groupby(['source'])['text']
    .count()
    .reset_index()
    .sort_values(by=["text"])
)

In [25]:
# Horizontal bar chart: one bar per source, annotated with its document count.
fig = px.bar(
    df_count_source,
    x="text",
    y="source",
    text="text",
    orientation="h",
    labels={"text": "Count of documents", "source": "Document source"},
)
# Print the bar annotations as whole numbers.
fig.update_traces(
    texttemplate="%{text:.0f}",
    textposition="auto",
    textfont_size=12,
)
fig.update_layout(
    title_text="Count of collected documents by the sources",
    uniformtext_minsize=8,
    uniformtext_mode="hide",
)
fig.show()

### 3.2 Count of collected Covid articles by type
(disinformation vs mainstream)

In [33]:
# Count the non-null article texts per value of the `dezinf` flag,
# ordered from the smaller to the larger group.
df_count_dezinf = (
    abt_covid
    .groupby(['dezinf'])['text']
    .count()
    .reset_index()
    .sort_values(by=["text"])
)

In [38]:
# Map the numeric flag to readable labels (1 -> "desinform", 0 -> "mainstream").
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: that form is chained
# assignment, which raises a FutureWarning in pandas 2.x and leaves the frame
# unchanged once Copy-on-Write is active.
df_count_dezinf["dezinf"] = df_count_dezinf["dezinf"].replace(
    [1, 0], ["desinform", "mainstream"]
)

In [46]:
# Horizontal bar chart: one bar per document type, coloured by type and
# annotated with its document count.
fig = px.bar(
    df_count_dezinf,
    x="text",
    y="dezinf",
    text="text",
    color="dezinf",
    orientation="h",
    labels={"text": "Count of documents", "dezinf": "Document type"},
)
# Print the bar annotations as whole numbers.
fig.update_traces(
    texttemplate="%{text:.0f}",
    textposition="auto",
    textfont_size=12,
)
fig.update_layout(
    uniformtext_minsize=8,
    uniformtext_mode="hide",
    title_text="Count of collected documents by type",
)
fig.show()

In [54]:
# List every column of the ABT table to see which features are available for the analysis.
abt_covid.columns

Index(['title', 'date', 'text', 'source', 'dezinf', 'title_word_cnt',
       'title_avg_word', 'title_exclam_num', 'title_ques_num',
       'title_stop_cnt', 'title_cnt_upper', 'text_word_cnt', 'text_avg_word',
       'text_ques_num', 'text_exclam_num', 'text_stop_cnt', 'text_cnt_upper',
       'text_cnt_num', 'title_cnt_num', 'title_cleaned', 'text_cleaned',
       'title_tokens', 'text_tokens', 'title_lemmas', 'text_lemmas',
       'title_pos', 'text_pos', 'text_unique_lemma_ratio',
       'title_cnt_unique_lemmas', 'title_stop_word_ratio',
       'text_cnt_unique_lemmas', 'text_stop_word_ratio', 'title_noun_ratio',
       'title_verb_ratio', 'title_propn_ratio', 'title_adj_ratio',
       'text_noun_ratio', 'text_verb_ratio', 'text_propn_ratio',
       'text_adj_ratio', 'title_ner_pers', 'title_ner_orgs', 'title_ner_locs',
       'text_ner_pers', 'text_ner_orgs', 'text_ner_locs', 'title_senti_list',
       'title_polarity', 'text_senti_list', 'text_polarity'],
      dtype='object')

### 3.3 Ratio of unique lemmas in the document text
(disinformation vs mainstream)

In [63]:
# Box plot of the unique-lemma ratio of the article text, one box per value of `dezinf`.
# The `labels` keys must be the plotted column names: the y column is
# `text_unique_lemma_ratio`, so that is the key that renames the y axis
# (a `text` key matches no plotted column and is silently ignored).
fig = px.box(
    abt_covid,
    x="dezinf",
    y="text_unique_lemma_ratio",
    color="dezinf",
    labels=dict(text_unique_lemma_ratio="Unique lemma ratio", dezinf="Document type"),
)
fig.update_layout(title_text="Ratio of unique lemmas in the document text")
fig.show()

### 3.4 Top Named Entities

### 3.4.1 Top Persons in all document titles

In [67]:
# Flatten the per-title person entities of all documents into one list.
flat_person_list = [
    person
    for title_persons in abt_covid["title_ner_pers"]
    for person in title_persons
]

In [71]:
# Keep the 15 most frequent person entities as (person, count) pairs ...
per_counts = Counter(flat_person_list).most_common(n=15)
# ... and turn them into a two-column frame for plotting.
df_top_pers = pd.DataFrame(data=per_counts, columns=['person', 'count'])

In [73]:
# Horizontal bar chart of the top person entities, annotated with their counts.
fig = px.bar(
    df_top_pers,
    x="count",
    y="person",
    text="count",
    orientation="h",
    labels={"count": "Count in documents", "person": "Person entity"},
)
# Print the bar annotations as whole numbers.
fig.update_traces(
    texttemplate="%{text:.0f}",
    textposition="auto",
    textfont_size=12,
)
fig.update_layout(
    uniformtext_minsize=8,
    uniformtext_mode="hide",
    title_text="Count of person entity in documents",
)
fig.show()

### 3.4.2 Top Persons in disinformation document titles

In [74]:
# Flatten the per-title person entities of the documents flagged dezinf == 1.
fake_flat_person_list = [
    person
    for title_persons in abt_covid.loc[abt_covid["dezinf"] == 1, "title_ner_pers"]
    for person in title_persons
]

In [86]:
# Keep the 15 most frequent person entities as (person, count) pairs ...
fake_per_counts = Counter(fake_flat_person_list).most_common(n=15)
# ... and turn them into a two-column frame for plotting.
fake_df_top_pers = pd.DataFrame(data=fake_per_counts, columns=['person', 'count'])

In [87]:
# Horizontal bar chart of the top person entities in `fake_df_top_pers`,
# annotated with their counts.
fig = px.bar(
    fake_df_top_pers,
    x="count",
    y="person",
    text="count",
    orientation="h",
    labels={"count": "Count in documents", "person": "Person entity"},
)
# Print the bar annotations as whole numbers.
fig.update_traces(
    texttemplate="%{text:.0f}",
    textposition="auto",
    textfont_size=12,
)
fig.update_layout(
    uniformtext_minsize=8,
    uniformtext_mode="hide",
    title_text="Count of person entity in desinformation documents",
)
fig.show()

### 3.4.3 Top Persons in mainstream document titles

In [80]:
# Flatten the per-title person entities of the documents flagged dezinf == 0.
mainsteam_flat_person_list = [
    person
    for title_persons in abt_covid.loc[abt_covid["dezinf"] == 0, "title_ner_pers"]
    for person in title_persons
]

In [82]:
# Keep the 15 most frequent person entities as (person, count) pairs ...
mainsteam_per_counts = Counter(mainsteam_flat_person_list).most_common(n=15)
# ... and turn them into a two-column frame for plotting.
mainsteam_df_top_pers = pd.DataFrame(data=mainsteam_per_counts, columns=['person', 'count'])

In [83]:
# Horizontal bar chart of the top person entities in `mainsteam_df_top_pers`,
# annotated with their counts.
fig = px.bar(
    mainsteam_df_top_pers,
    x="count",
    y="person",
    text="count",
    orientation="h",
    labels={"count": "Count in documents", "person": "Person entity"},
)
# Print the bar annotations as whole numbers.
fig.update_traces(
    texttemplate="%{text:.0f}",
    textposition="auto",
    textfont_size=12,
)
fig.update_layout(
    uniformtext_minsize=8,
    uniformtext_mode="hide",
    title_text="Count of person entity in mainstream documents",
)
fig.show()

### 3.4.1 Top Locations in all document titles