In [131]:
%pip install numpy

Note: you may need to restart the kernel to use updated packages.




In [132]:
%pip install pandas






In [133]:
import pandas as pd
import os

In [134]:
def load_and_clean_multiple_dataframes(file_paths, drop='rows'):
    """Load several CSV files, print a diagnostic report for each, and drop missing values.

    For every path the function prints a preview (``head``), the ``info``
    summary, descriptive statistics and a per-column count of non-null and
    missing values, then removes missing data according to ``drop``.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the CSV files to load. Each path is also used as the key of
        the returned dictionary.
    drop : str, default 'rows'
        ``'rows'`` drops every row containing a missing value, ``'columns'``
        drops every column containing a missing value, and any other value
        leaves the loaded frame unchanged.

    Returns
    -------
    dict
        Maps each successfully loaded path to its cleaned DataFrame. Files
        that do not exist, are empty, cannot be parsed, or raise any other
        error are reported on stdout and left out of the dictionary.
    """
    dataframes = {}

    for file_path in file_paths:
        try:
            # Skip paths that do not exist instead of letting read_csv raise.
            if not os.path.exists(file_path):
                print(f"Le fichier '{file_path}' n'existe pas.")
                continue

            # Load the CSV file.
            data = pd.read_csv(file_path)
            print(f"\n--- Chargement réussi : {file_path} ---")

            # Preview of the first rows.
            print(f"\n--- Aperçu des données : {file_path} ---")
            print(data.head())

            # Column names, dtypes and non-null counts.
            # DataFrame.info() prints its report itself and returns None, so it
            # must not be wrapped in print() (that would emit a stray "None").
            print(f"\n---Informations sur le DataFrame: {file_path}---")
            data.info()

            # Descriptive statistics for every column, numeric or not.
            print(f"\n---Statistiques descriptives : {file_path}---")
            print(data.describe(include='all'))

            # Per-column report of non-null and missing values.
            print(f"\n--- Aperçu des valeurs non nulles et manquantes: {file_path} ---")
            total_count = len(data)  # identical for every column, so computed once
            for col in data.columns:
                non_null_count = data[col].count()
                missing_count = total_count - non_null_count
                # A header-only file has zero rows; report 0% rather than dividing by zero.
                if total_count:
                    missing_percentage = (missing_count / total_count) * 100
                else:
                    missing_percentage = 0.0
                print(f"Colonne: {col}")
                print(f"  Non-null count: {non_null_count}")
                print(f"  Missing count: {missing_count} ({missing_percentage:.2f}%)")
                print("-" * 40)

            # Remove missing values according to the requested strategy.
            if drop == 'rows':
                print("\n--- Suppression des lignes contenant des valeurs manquantes ---")
                data_cleaned = data.dropna()  # drop rows holding at least one missing value
            elif drop == 'columns':
                print("\n--- Suppression des colonnes contenant des valeurs manquantes ---")
                data_cleaned = data.dropna(axis=1)  # drop columns holding at least one missing value
            else:
                print("\n--- Aucun nettoyage appliqué ---")
                data_cleaned = data

            print(f"\nDataFrame nettoyé : {data_cleaned.shape[0]} lignes et {data_cleaned.shape[1]} colonnes")

            # Register the cleaned frame under its source path.
            dataframes[file_path] = data_cleaned

        except pd.errors.EmptyDataError:
            print(f"Le fichier '{file_path}' est vide.")
        except pd.errors.ParserError as e:
            print(f"Erreur de parsing dans le fichier '{file_path}' : {e}")
        except Exception as e:
            # Best-effort loading: report the failure and continue with the next file.
            print(f"Une erreur s'est produite lors du chargement de '{file_path}' : {e}")

    return dataframes

In [135]:
# Raw CSV exports to load, listed in the order they are processed.
raw_file_names = (
    'gossipcop_fake.csv',
    'gossipcop_real.csv',
    'politifact_fake.csv',
    'politifact_real.csv',
)
file_paths = [
    os.path.join('..', 'data', 'raw_fake_news', file_name)
    for file_name in raw_file_names
]

# Load every file and drop the rows that contain missing values.
cleaned_dataframes = load_and_clean_multiple_dataframes(file_paths, drop='rows')


--- Chargement réussi : ..\data\raw_fake_news\gossipcop_fake.csv ---

--- Aperçu des données : ..\data\raw_fake_news\gossipcop_fake.csv ---
                     id                                           news_url  \
0  gossipcop-2493749932  www.dailymail.co.uk/tvshowbiz/article-5874213/...   
1  gossipcop-4580247171  hollywoodlife.com/2018/05/05/paris-jackson-car...   
2   gossipcop-941805037  variety.com/2017/biz/news/tax-march-donald-tru...   
3  gossipcop-2547891536  www.dailymail.co.uk/femail/article-3499192/Do-...   
4  gossipcop-5476631226  variety.com/2018/film/news/list-2018-oscar-nom...   

                                               title  \
0  Did Miley Cyrus and Liam Hemsworth secretly ge...   
1  Paris Jackson & Cara Delevingne Enjoy Night Ou...   
2  Celebrities Join Tax March in Protest of Donal...   
3  Cindy Crawford's daughter Kaia Gerber wears a ...   
4      Full List of 2018 Oscar Nominations – Variety   

                                           tweet_ids 

In [136]:
# Build one labelled frame per source file.
# .assign() returns a new DataFrame, so the frames stored in `cleaned_dataframes`
# are left untouched; assigning a column directly onto the result of dropna()
# can raise SettingWithCopyWarning on pandas versions without copy-on-write.
gossipcop_fake_df = cleaned_dataframes[
    os.path.join('..', 'data', 'raw_fake_news', 'gossipcop_fake.csv')
].assign(Annotations="Fake_news")
gossipcop_real_df = cleaned_dataframes[
    os.path.join('..', 'data', 'raw_fake_news', 'gossipcop_real.csv')
].assign(Annotations="Good_news")
politifact_fake_df = cleaned_dataframes[
    os.path.join('..', 'data', 'raw_fake_news', 'politifact_fake.csv')
].assign(Annotations="Fake_news")
politifact_real_df = cleaned_dataframes[
    os.path.join('..', 'data', 'raw_fake_news', 'politifact_real.csv')
].assign(Annotations="Good_news")

# Stack the four labelled frames and rename the "title" column to "Title_news".
# The rename is chained (no inplace=True) so the cell builds combined_df in one
# expression from its inputs.
combined_df = pd.concat(
    [gossipcop_fake_df, gossipcop_real_df, politifact_fake_df, politifact_real_df],
    ignore_index=True,
).rename(columns={"title": "Title_news"})

# Show the first rows of the merged, labelled frame.
print("\n--- DataFrame fusionné avec annotations ---")
print(combined_df.head())



--- DataFrame fusionné avec annotations ---
                     id                                           news_url  \
0  gossipcop-2493749932  www.dailymail.co.uk/tvshowbiz/article-5874213/...   
1  gossipcop-4580247171  hollywoodlife.com/2018/05/05/paris-jackson-car...   
2   gossipcop-941805037  variety.com/2017/biz/news/tax-march-donald-tru...   
3  gossipcop-2547891536  www.dailymail.co.uk/femail/article-3499192/Do-...   
4  gossipcop-5476631226  variety.com/2018/film/news/list-2018-oscar-nom...   

                                          Title_news  \
0  Did Miley Cyrus and Liam Hemsworth secretly ge...   
1  Paris Jackson & Cara Delevingne Enjoy Night Ou...   
2  Celebrities Join Tax March in Protest of Donal...   
3  Cindy Crawford's daughter Kaia Gerber wears a ...   
4      Full List of 2018 Oscar Nominations – Variety   

                                           tweet_ids Annotations  
0  284329075902926848\t284332744559968256\t284335...   Fake_news  
1  992895508267

In [137]:
# Display the merged frame; the rendered output is truncated to the first and last rows.
combined_df

Unnamed: 0,id,news_url,Title_news,tweet_ids,Annotations
0,gossipcop-2493749932,www.dailymail.co.uk/tvshowbiz/article-5874213/...,Did Miley Cyrus and Liam Hemsworth secretly ge...,284329075902926848\t284332744559968256\t284335...,Fake_news
1,gossipcop-4580247171,hollywoodlife.com/2018/05/05/paris-jackson-car...,Paris Jackson & Cara Delevingne Enjoy Night Ou...,992895508267130880\t992897935418503169\t992899...,Fake_news
2,gossipcop-941805037,variety.com/2017/biz/news/tax-march-donald-tru...,Celebrities Join Tax March in Protest of Donal...,853359353532829696\t853359576543920128\t853359...,Fake_news
3,gossipcop-2547891536,www.dailymail.co.uk/femail/article-3499192/Do-...,Cindy Crawford's daughter Kaia Gerber wears a ...,988821905196158981\t988824206556172288\t988825...,Fake_news
4,gossipcop-5476631226,variety.com/2018/film/news/list-2018-oscar-nom...,Full List of 2018 Oscar Nominations – Variety,955792793632432131\t955795063925301249\t955798...,Fake_news
...,...,...,...,...,...
21402,politifact6931,http://www.politifact.com/truth-o-meter/promis...,The Obameter: Introduce a comprehensive immigr...,21096374968\t21096771824\t9413452992876544\t12...,Good_news
21403,politifact13619,http://www.cnn.com/2017/01/05/politics/border-...,"Trump asking Congress, not Mexico, to pay for ...",817357495047979008\t817357627566985217\t817357...,Good_news
21404,politifact329,https://web.archive.org/web/20080131000131/htt...,Change We Can Believe In,634287923135909888\t946743411100536832\t946816...,Good_news
21405,politifact4720,http://www.youtube.com/watch?v=EhyMplwY6HY,Romneys ProLife Conversion Myth or Reality Jun...,188871706637647874,Good_news


In [138]:
# Print the column dtypes and non-null counts of the merged frame.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21407 entries, 0 to 21406
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           21407 non-null  object
 1   news_url     21407 non-null  object
 2   Title_news   21407 non-null  object
 3   tweet_ids    21407 non-null  object
 4   Annotations  21407 non-null  object
dtypes: object(5)
memory usage: 836.3+ KB


In [139]:
# Destination CSV file for the merged, annotated DataFrame.
output_path = os.path.join('..', 'data', 'processed_fake_news', 'mabs.csv')


In [140]:
# Create the destination folder if needed: to_csv() does not create missing
# directories and would fail with OSError on a fresh checkout.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the merged frame without the RangeIndex column.
combined_df.to_csv(output_path, index=False)