# Propaganda analysis - keyword extraction from newspapers

### Load Impresso OCR data for french WW1 content

In [2]:
import pandas as pd

df_french = pd.read_csv("https://mbien-public.s3.eu-central-1.amazonaws.com/dh-412/french.csv", sep=";")
df_french.head()

Unnamed: 0,uid,type,language,title,size,country,newspaper,issue,pages,nb_pages,...,year,is_on_front,date,persons_mentioned,locations_mentioned,content,access_right,content_provider,is_content_available,"[total:10000,available:10000]"
0,GDL-1914-07-29-a-i0005,,fr,L'Italie et la guerre,371,CH,GDL,GDL-1914-07-29-a,12,2,...,1914,True,1914-07-29T00:00:00Z,,Italie|Rome|Italie|Italie|Italie|Autriche|Ital...,L'Italie et la guerre (Lettre Ae Honte) (De no...,OpenPrivate,SNL,y,
1,GDL-1914-07-29-a-i0029,,fr,CHRONIQUE VAtAISANNE,279,CH,GDL,GDL-1914-07-29-a,4,1,...,1914,False,1914-07-29T00:00:00Z,,Sion|Suisse,"CHRONIQUE VAtAISANNE Sion, 27 juillet. Les vin...",OpenPrivate,SNL,y,
2,GDL-1914-08-07-a-i0028,,fr,,1717,CH,GDL,GDL-1914-08-07-a,3,1,...,1914,False,1914-08-07T00:00:00Z,Les Anglais|Les Hollandais|Les Autrichiens|Con...,Angleterre|Autriche|Russie|Bâle|Mulhouse|Berli...,La guerre sur mer. raqueDots capturés. Londres...,OpenPrivate,SNL,y,
3,GDL-1914-08-01-a-i0026,,fr,A la population de Lausanne,287,CH,GDL,GDL-1914-08-01-a,3,1,...,1914,False,1914-08-01T00:00:00Z,,Lausanne|Lausanne|Vevey|Genève,A la population de Lausanne Le comité de la So...,OpenPrivate,SNL,y,
4,GDL-1914-08-01-a-i0023,,fr,NOUVELLES DIVERSES,40,CH,GDL,GDL-1914-08-01-a,2,1,...,1914,False,1914-08-01T00:00:00Z,,Rome,NOUVELLES DIVERSES Bnlaarie. Le gouvernement d...,OpenPrivate,SNL,y,


### Prepare the text processing and tokenization pipeline

In [3]:
df_french.content

0       L'Italie et la guerre (Lettre Ae Honte) (De no...
1       CHRONIQUE VAtAISANNE Sion, 27 juillet. Les vin...
2       La guerre sur mer. raqueDots capturés. Londres...
3       A la population de Lausanne Le comité de la So...
4       NOUVELLES DIVERSES Bnlaarie. Le gouvernement d...
                              ...                        
9995    Etats-Unis et Allemagne L'affaire du " Lyman L...
9996    La situation en Allemagne Je vais essayé de tr...
9997    En Russie Démarche de la noblesse 'en faveur d...
9998    CANTONJDE VÂUO Grand Conseil Séance du mercred...
9999    Les Sports 8 PORTS D'HIVER i Le championnat de...
Name: content, Length: 10000, dtype: object

In [4]:
import spacy
from tqdm.notebook import tqdm

tqdm.pandas()

nlp_fr = spacy.load('fr_core_news_sm', disable=["ner", "parser"])
nlp_de = spacy.load('de_core_news_sm', disable=["ner", "parser"])

  from pandas import Panel


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# preprocessing
pos_to_keep = ['ADJ', 'ADV', 'NOUN', 'PROPN', 'VERB']

def lematization_pos_tagging(spacy_handle, text):
    doc = spacy_handle(str(text))
    lemmatized = [x.lemma_ for x in doc if x.pos_ in pos_to_keep and x.is_stop != True]
    return lemmatized


In [None]:
tokens_french = df_french.content.progress_apply(lambda row: ' '.join(lematization_pos_tagging(nlp_fr, row)))

### Do the same for french reference corpus (non-WW1 subcorpus)

In [11]:
df_french_reference = pd.read_csv("french_reference.csv", sep=";")
tokens_reference_french = df_french_reference.content.progress_apply(lambda row: ' '.join(lematization_pos_tagging(nlp_fr, row)))

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




### Save the french files to the pickle (uploaded to S3)

In [13]:
import pickle
with open("tokens_french.pickle", "wb") as f:
    pickle.dump(tokens_french, f)

with open("tokens_reference_french.pickle", "wb") as f:
    pickle.dump(tokens_reference_french, f)

### Do the same for german newspapers

In [6]:
df_german = pd.read_csv("german.csv", sep=";")
tokens_german = df_german.content.progress_apply(lambda row: ' '.join(lematization_pos_tagging(nlp_de, row)))

HBox(children=(FloatProgress(value=0.0, max=7385.0), HTML(value='')))




In [9]:
tokens_german

0       Morgenblatt ZeUnn Mrchtt M MgW Mittwoch mpril ...
1       Grftes Morgenblatt Dtt Mchtt Mnz M WM Mittwoch...
2       Kandel YerkeHr elreidebilrst Zsnch Getreidebör...
3       trwerb Vettrauende plötzlich Pfla Ver fchwleri...
4       lnlpalti ° l Np Neu bonne Insertionspreise Zür...
                              ...                        
7380    ßidgenossenschafi Dementi Nov . zuständig Ttcl...
7381    VlDNNN l tusltiyle vanml c besonder Landes-Vür...
7382    Macke vcsbllw sel Än-tr Vl VndcrN ilbcrslnssif...
7383    M KWmdes WarnWzsz^lchen Ilsza H letzt Rede Irr...
7384    sammlung sprache halten Demonstrant beabsichti...
Name: content, Length: 7385, dtype: object

In [7]:
df_german_reference = pd.read_csv("german_reference.csv", sep=";")
tokens_reference_german = df_german_reference.content.progress_apply(lambda row: ' '.join(lematization_pos_tagging(nlp_de, row)))

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [8]:
import pickle
with open("tokens_german.pickle", "wb") as f:
    pickle.dump(tokens_german, f)

with open("tokens_reference_german.pickle", "wb") as f:
    pickle.dump(tokens_reference_german, f)

# Depickle and analyse all

In [8]:
import pandas as pd

tokens_german = pd.read_pickle("https://mbien-public.s3.eu-central-1.amazonaws.com/dh-412/tokens_german.pickle")
tokens_reference_german = pd.read_pickle("https://mbien-public.s3.eu-central-1.amazonaws.com/dh-412/tokens_reference_german.pickle")
tokens_french = pd.read_pickle("https://mbien-public.s3.eu-central-1.amazonaws.com/dh-412/tokens_french.pickle")
tokens_reference_french = pd.read_pickle("https://mbien-public.s3.eu-central-1.amazonaws.com/dh-412/tokens_reference_french.pickle")

In [18]:
tokens_reference_french

0       nouvelle ™ édition escadre français baltique r...
1       offre place demande suite bon sommelièr puotog...
2       chronique zurichois correspondant j ~ v idée m...
3       banque cantonal ineuchâtelois banque délivre j...
4       ANNONCES publicita société Anonyme Suisse publ...
                              ...                        
9995    attentat Prague Prague janvier enquête prélimi...
9996    Francfort janvier gazette Francfort apprendre ...
9997    | iHHrfiii | iihm Hj ii iiiiuiiiiihhifii ili n...
9998    Lausanne janvier barrage illusoire discours mo...
9999    confédération suisse legation petrograd départ...
Name: content, Length: 10000, dtype: object

In [15]:
tfidf = TfidfVectorizer()
tfidf.fit(tokens_reference_french)

confederation = tfidf.transform(tokens_french)

In [28]:
tfidf_fr_df = pd.DataFrame(confederation.toarray().T)


In [30]:
tfidf_fr_df.index = tfidf.get_feature_names()
tfidf_fr_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
tfidf_fr_df = tfidf_fr_df.sort_values(by = 0, ascending=False)
tfidf_fr_df[:50]

In [25]:
import numpy as np

feature_array = np.array(tfidf.get_feature_names())
tfidf_sorting = np.argsort(confederation.toarray()).flatten()[::-1]

n = 3
top_n = feature_array[tfidf_sorting][:n]

KeyboardInterrupt: 