<div align="center"><h1> Projet Data Science </h1></div>
<div align="center"><h2> Classification d'assertions selon leurs valeurs de véracité (automatic fact-checking) </h2></div>
<h3>Membres du groupe</h3>
<ul>
    <li>Meriem AMERAOUI</li>
    <li>Dounia BELABIOD</li>
    <li>Jihene BOUHLEL</li>
    <li>Bahaa Eddine NIL</li>
</ul>

<div class="alert alert-block alert-info" align="center">
    <h1>
        Executing the basics
    </h1>
</div>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Reading data
def read_data(*args, **kwargs):
    """Load claims from the CSV datasets, normalise them and export a copy.

    Positional arguments are accepted but ignored; options are passed by keyword.

    Keyword Args:
        data: Corpus to read -- 'test' (default, datasets/test.csv),
            'polsno' (pol1 + pol2 + sno1 + sno2) or 'all' (polsno + reste).
        number: Number of claims wanted (default 1000). When the corpus holds
            fewer rows, every available row is used instead.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            random draw made for 'polsno' / 'all' can be reproduced. Defaults
            to None (non-deterministic draw). 'test' always takes the first
            ``number`` rows, so the seed has no effect there.

    Returns:
        The selected claims as a DataFrame (also written to
        'datasets/generated.csv' with ';' as separator), or None when ``data``
        is not one of the accepted values.
    """
    data = kwargs.get("data", 'test')
    number = kwargs.get("number", 1000)
    random_state = kwargs.get("random_state", None)
    labels = ["ID", "Text", "Date", "TruthRating", "RatingName", "Author", "Headline",
              "NamedEntitiesClaim", "NamedEntitiesArticle", "Keywords", "Source", "SourceURL", "Link", "Language"]

    # CSV files (without extension) making up each corpus, in concatenation order.
    sources = {
        'test': ['test'],
        'polsno': ['pol1', 'pol2', 'sno1', 'sno2'],
        'all': ['pol1', 'pol2', 'sno1', 'sno2', 'reste'],
    }

    if data not in tuple(sources):
        print('Please selecte somthing in [\'all\', \'polsno\', \'test\']')
        return None

    # Only the files of the requested corpus are read. The first line of each
    # file is skipped and the column names are forced to `labels`.
    frames = [
        pd.read_csv(f'datasets/{name}.csv', sep=',', names=labels, skiprows=1)
        for name in sources[data]
    ]
    # DataFrame.append was removed in pandas 2.0; concat with ignore_index
    # gives the same rows with a fresh 0..n-1 index.
    pool = pd.concat(frames, sort=False, ignore_index=True)

    available = pool.shape[0]
    if number > available:
        print(f'Not enough data !\nReading only {available} claims')
        number = available
    else:
        print(f'Reading successfully {number} claims')

    if data == 'test':
        # Deterministic: first `number` rows. Copy so the edit below does not
        # target a view of `pool`.
        df_func = pool.iloc[:number].copy()
    else:
        df_func = pool.sample(n=number, random_state=random_state).reset_index(drop=True)

    # Boolean ratings become the strings 'TRUE' / 'FALSE'. The result is
    # assigned back instead of using a chained inplace replace, which does not
    # reliably modify the parent frame.
    ratings = df_func['RatingName'].replace(to_replace=True, value='TRUE')
    ratings = ratings.replace(to_replace=False, value='FALSE')
    df_func['RatingName'] = ratings

    df_func.to_csv('datasets/generated.csv', sep=';', index=False)
    print('\nFile \'generated.csv\' created !\nYou can found this csv in the following path datasets/generated.csv')

    return df_func

<div class="alert alert-block alert-info" align="center">
    <h1>
        Generating & saving data
    </h1>
</div>

<h2>read_data(data = 'test', number = 1000)</h2>
<ul>
    <li><b>data</b></li>
	<ul>
		<li>'all' for all websites</li>
		<li>'polsno' for politifact's and snopes' claims </li>
		<li>'test' for csv test (all sites) with 10000 claims max</li>
	</ul>
        <li><b>number</b></li>
	<ul>
		<li>number of claims to generate</li>
	</ul>
</ul>

In [None]:
# Working sample: the first 250 claims of the test corpus.
df = read_data(number=250, data='test')

<div class="alert alert-block alert-info" align="center">
    <h1>
        Printing information
    </h1>
</div>

## Shape

In [None]:
# (rows, columns) of the loaded sample, printed on the line after the label.
print('Shape :', df.shape, sep='\n')

## Informations

In [None]:
# Column dtypes, non-null counts and memory footprint.
print('Informations :')
df.info()

## Description

In [None]:
# Summary statistics of the frame, rendered as a rich table.
print('Description :')
display(df.describe())

## Printing some lines

In [None]:
# Preview of the first five claims.
print('Printing some lines :')
display(df.head(5))

## Affichage d'informations sur toutes les colonnes

In [None]:
# For every column, count how many cells are null (True) versus filled (False).
for column in df.columns:
    null_mask = df[column].isnull()
    null_counts = null_mask.value_counts()
    print(f'Nombre de valeurs nulles pour {column} :\n{null_counts}\n')

## Affichage des colonnes vides

In [None]:
# Collect the columns holding at least one null value.
# NOTE(review): the message calls them "empty" columns, but the test is
# `.any()`, not `.all()` -- confirm which one is intended.
array = [column for column in df.columns if df[column].isnull().any()]
print(f'Nombre de colonnes vides : {len(array)}\nLes colonnes vide sont :\n{array}')

## Description de toutes les colonnes

In [None]:
# Per-column summary, one rich table per column.
for column in df.columns:
    column_summary = df[column].describe()
    display(column_summary)

## Get a series of unique values in each column of the dataframe

In [None]:
# List the distinct values found in each column, together with their count.
for column in df.columns:
    uniqueValues = df[column].unique()
    unique_count = len(uniqueValues)
    print(f'Number of unique elements in column {column} : {unique_count}, values & type :\n{uniqueValues}\n')

## Affichage du nombre des différents TruthRating

In [None]:
# Number of claims (rows with a non-null ID) for each TruthRating code.
print('La colonne TruthRating contient :')
for rating_code, rating_label in ((-1, 'Other'), (1, 'False'), (2, 'Mixture'), (3, 'True')):
    matching_ids = df.loc[df["TruthRating"] == rating_code, "ID"]
    print(f'{matching_ids.count()} {rating_label}')

<div class="alert alert-block alert-info" align="center">
    <h1>
        Visualization
    </h1>
</div>

In [None]:
# Number of claims per source; tick labels are tilted so long names stay readable.
chart = sns.countplot(data=df, x='Source')
for tick_label in chart.get_xticklabels():
    tick_label.set_rotation(45)
    tick_label.set_horizontalalignment('right')
plt.show()

In [None]:
# One count panel per RatingName, each showing the number of claims per source.
chart = sns.catplot(data=df, x='Source', col='RatingName', kind='count')
for ax in chart.axes.flat:
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(45)
        tick_label.set_horizontalalignment('right')
plt.show()

In [None]:
# Claims per source in a single panel, with bars split by RatingName.
# `x` is passed by keyword: recent seaborn releases take `data` as the only
# positional parameter, so a positional 'Source' would clash with `data = df`.
chart = sns.catplot(x = 'Source', data = df, hue = 'RatingName', kind = 'count')
for ax in chart.axes.ravel():
    plt.setp(ax.get_xticklabels(), rotation = 45, horizontalalignment = 'right')
plt.show()

In [None]:
# Map of missing cells: one row per claim, one column per field.
sns.heatmap(data=df.isna(), cbar=False)
plt.show()