# Text Preprocessing

This notebook contains the text preprocessing or data cleaning step for the Parti Pris corpus. At the end of the previous notebook, we were

Metadata first from ToC to get year, month, volume, issue, etc -> connected to the texts through the 'source_file'





In [None]:
skip_files = {
    "163122_1-1964-09-01.json",
    "163122_1-1964-12-01.json",
    "163122_2-1966-06.json"
}

In [3]:
import re
import os
import json
import pandas as pd

In [7]:
# read the json files from data/transcriptions

transcriptions_dir = "../data/transcriptions"
json_files = [f for f in os.listdir(transcriptions_dir) if f.endswith('.json')]

data = []
for file in json_files:
    with open(os.path.join(transcriptions_dir, file), 'r', encoding='utf-8') as f:
        entry = json.load(f)
        if isinstance(entry, list):
            for item in entry:
                item['source_file'] = file
                data.append(item)
        else:
            entry['source_file'] = file
            data.append(entry)

df = pd.DataFrame(data)
df.head()

Unnamed: 0,author,title,page_range,text,source_file
0,parti pris,manifeste 64-65,4-17,manifeste 64-65 Toute révolution détruit l'anc...,163122_1-1964-09.json
1,la direction.,lettre au lecteur,18-19,lettre au lecteur parti pris se prépare à une ...,163122_1-1964-09.json
2,paul chamberland,bilan d'un combat,20-35,bilan d'un combat paul chamberland Nous lutton...,163122_1-1964-09.json
3,jean-marc piotte,autocritique de parti pris,36-44,autocritique de parti pris jean-marc piotte Pi...,163122_1-1964-09.json
4,pierre maheu,notes pour une politisation,45-56,notes pour une politisation pierre maheu Dans ...,163122_1-1964-09.json


In [10]:
# 163122_1-1964-09-01.json is basically empty because it is an overview of the whole year
# we will remove it
df = df[df['source_file'] != '163122_1-1964-09-01.json']

In [33]:
# 163122_2-1966-06.json is also empty. According to BAnQ, it was never published, so the record
# states "non paru". We will remove it as well
df = df[df['source_file'] != '163122_2-1966-06.json']

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 707 entries, 0 to 826
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   author       707 non-null    object
 1   title        707 non-null    object
 2   page_range   707 non-null    object
 3   text         707 non-null    object
 4   source_file  707 non-null    object
dtypes: object(5)
memory usage: 33.1+ KB


In [13]:
# look at the entries with pierre vallieres
df[df['author'].str.lower().str.contains('pierre vallieres')]

Unnamed: 0,author,title,page_range,text,source_file
434,"Charles GAGNON, Pierre VALLIERES",document Grève de la Fain pour la reconnaissan...,88-91,document Grève de la Faim pour la reconnaissan...,163122_2-1966-11-B.json


In [15]:
df[df['author'].str.lower().str.contains('vallières')]

Unnamed: 0,author,title,page_range,text,source_file
103,pierre vallières,"les grèves de l'été, un tournant?",103,,163122_1-1965-08-A.json
438,Pierre Vallières,Cuba révolutionnaire.,21-27,La réunion en janvier 1966 de la première con ...,163122_2-1967-09.json
692,pierre vallières,"les grèves de l'été, un tournant?",40-40,Il est trop tôt pour affirmer avec certitude q...,163122_1-1965-08-B.json
737,pierre vallières,pour l'union de la gauche,51-52,Les militants de Révolution québécoise ont déc...,163122_1-1965-06-B.json


In [None]:
# Convert author names to lowercase and get unique authors
df['author'] = df['author'].str.lower()
unique_authors = df['author'].unique()
print(unique_authors)

['parti pris' 'la direction.' 'paul chamberland' 'jean-marc piotte'
 'pierre maheu' 'raymond villeneuve' 'laurent girouard'
 'jacques lebrecque' 'andré major' 'denys arcand' 'yvon hussereau'
 'jean racine' 'jacques trudel' 'gérald godin' 'michel euvrard'
 'patrick straram' 'g. g.' 'p.' 'j. d.' 'm. g...' 'p. s' 'p. s.' ''
 'j. f.' 'éditorial' 'pierre lefebvre' 'le bureau exécutif' 'mario dumais'
 'jacques poisson' 'andré brochu' 'déclaration' 'jacques brault'
 'marcel rioux' 'jacques ferron' 'jean-pascal benoist' 'gaétan tremblay'
 'pierre vadeboncoeur' 'un fonctionnaire (québec)' 'gérald fortin'
 'jacques godbout' 'jean pierre lefevre' 'guy bourassa' 'raoul roy'
 'jan depocas' 'françois aquin' 'r. s.' 'p.p. / g. t., p. m.'
 'gilles dostaler' 'gilles bourque et luc racine' 'pierre desrosiers'
 'jean-robert rémillard' 'jean-claude lapointe' 'roger guy'
 'robert mackay' 'camille limoges' 'serge grenier'
 'mouvement de libération populaire et de la revue parti pris'
 'andrée bertrand-ferre

In [22]:
# sort unique authors by article count
author_counts = df['author'].value_counts()
sorted_authors = author_counts.sort_values(ascending=False)
print(sorted_authors)

author
patrick straram                                                 32
pierre maheu                                                    29
unknown                                                         29
gérald godin                                                    27
parti pris                                                      22
                                                                ..
collectif (not explicitly stated, but implies common effort)     1
andrée ferretti-bertrand                                         1
yvon husereau                                                    1
various authors                                                  1
robert boily                                                     1
Name: count, Length: 214, dtype: int64


In [37]:
df[df['author'].str.lower().str.contains('guenard')]

Unnamed: 0,author,title,page_range,text,source_file


In [35]:
# unique source files and counts
unique_sources = df['source_file'].unique()
source_counts = df['source_file'].value_counts()
print(unique_sources)
print(source_counts)

['163122_1-1964-09.json' '163122_2-1966-02-B.json' '163122_1-1965-04.json'
 '163122_1-1965-06-A.json' '163122_2-1968-05-B.json'
 '163122_2-1967-01-A.json' '163122_1-1964-05.json'
 '163122_1-1965-08-A.json' '163122_1-1965-01-B.json'
 '163122_2-1967-03-A.json' '163122_1-1964-04.json' '163122_1-1964-12.json'
 '163122_2-1966-01.json' '163122_1-1965-05.json' '163122_2-1968-05-C.json'
 '163122_2-1966-09-A.json' '163122_2-1966-05-A.json'
 '163122_1-1965-02.json' '163122_1-1963-11.json' '163122_2-1968-03-B.json'
 '163122_1-1964-03.json' '163122_2-1967-10-B.json'
 '163122_1-1965-12-A.json' '163122_2-1968-02.json'
 '163122_1-1964-06-A.json' '163122_2-1966-11-A.json'
 '163122_1-1964-02.json' '163122_1-1963-10.json' '163122_1-1965-10-A.json'
 '163122_2-1967-05-A.json' '163122_2-1968-04-A.json'
 '163122_2-1968-01-B.json' '163122_1-1965-03.json' '163122_1-1964-01.json'
 '163122_1-1965-10-B.json' '163122_2-1967-05-B.json'
 '163122_2-1968-04-B.json' '163122_2-1966-04.json'
 '163122_2-1968-01-A.json' '