# Deep learning project

## Marie PHILIPPE & Claire SERRAZ - M2 D3S

# 1. Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk import stem, WordNetLemmatizer
# The cleaning pipeline needs the stop-word list and, for WordNetLemmatizer,
# the WordNet corpus; fetch both so the notebook runs on a fresh environment.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # some NLTK releases also require this to load WordNet
wnl = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /Users/marie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 2. Import the data

In [3]:
# Choose the path
#
# The data directory defaults to Marie's local folder; set the
# DL_PROJECT_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
DATA_DIR = os.environ.get(
    "DL_PROJECT_DATA_DIR",
    "/Users/marie/Desktop/Cours/S1/DL/Project/Data",
)
os.chdir(DATA_DIR)

In [4]:
# Load both halves of the corpus from the current working directory:
# Fake.csv holds the fake articles, True.csv the true ones.
fake, true = (pd.read_csv(csv_name) for csv_name in ("Fake.csv", "True.csv"))

In [5]:
# See how the data looks: preview the first rows of the fake-news table

fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [6]:
# Preview the first rows of the true-news table
true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [7]:
# Size of the fake-news table as (rows, columns)
fake.shape

(23481, 4)

In [8]:
# Size of the true-news table as (rows, columns)
true.shape

(21417, 4)

# 3. Merge the fake and true datasets

In [9]:
# Label every row with the dataset it came from
fake['class'], true['class'] = 'fake', 'true'

In [10]:
# Check that the 'class' column is now present on the fake-news table
fake.head()

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",fake


In [11]:
# Check that the 'class' column is now present on the true-news table
true.head()

Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True


In [12]:
# Merge the 2 datasets: stack the fake rows on top of the true rows and
# renumber the index from 0. pd.concat is used because DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
data = pd.concat([fake, true], ignore_index=True)

In [13]:
# Size of the merged table as (rows, columns)
data.shape

(44898, 5)

In [14]:
# Preview the first rows of the merged table
data.head()

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",fake


# 4. Clean the text

In [15]:
# Define the stopwords: NLTK's English stop-word list, read as a global by
# remove_stop_words
sw = stopwords.words('english')

In [16]:
# Functions to clean the text

# Convert text to lowercase
def convert_text_to_lowercase(df, colname):
    """Lowercase every string in ``df[colname]``.

    The column is overwritten on ``df`` itself (no copy is made) and the same
    frame is returned so the call can be chained with ``DataFrame.pipe``.
    """
    lowered = df[colname].str.lower()
    df[colname] = lowered
    return df

def not_regex(pattern):
    """Return a regex matching one character at a position where ``pattern`` does not match.

    ``pattern`` is wrapped in a negative lookahead, followed by ``.`` to
    consume the character, and the whole expression is placed in a group.
    """
    return rf"((?!{pattern}).)"

# Remove punctuation and new line characters '\n'
def remove_punctuation(df, colname):
    """Replace line breaks and punctuation in ``df[colname]`` with spaces.

    Letters and digits are kept, as are ``-`` and ``/`` when they sit between
    two word characters (e.g. ``soviet-era``, ``a/b``); every other character
    is replaced by a single space. The column is overwritten on ``df`` itself
    and the same frame is returned.
    """
    # Line breaks are literal one-character replacements, not patterns.
    df[colname] = df[colname].str.replace('\n', ' ', regex=False)
    df[colname] = df[colname].str.replace('\r', ' ', regex=False)
    alphanumeric_characters_extended = '(\\b[-/]\\b|[a-zA-Z0-9])'
    # regex=True must be explicit: pandas 2.0 changed the default to False,
    # which would treat the pattern as literal text and strip nothing.
    df[colname] = df[colname].str.replace(
        not_regex(alphanumeric_characters_extended), ' ', regex=True
    )
    return df

# Tokenize sentences
def tokenize_sentence(df, colname):
    """Split every string in ``df[colname]`` on whitespace into a list of tokens.

    The column is overwritten on ``df`` itself and the same frame is returned.
    """
    tokens = df[colname].str.split()
    df[colname] = tokens
    return df

# Remove the stopwords
def remove_stop_words(df, colname):
    """Drop stop words from each word list in ``df[colname]``.

    Each cell is iterated word by word and rebuilt as a list without the
    words found in the module-level ``sw`` collection. The column is
    overwritten on ``df`` itself and the same frame is returned.
    """
    # ``sw`` is a list, so ``word in sw`` scans it for every single token;
    # build a set once per call for constant-time lookups (same result).
    stop_words = frozenset(sw)
    df[colname] = df[colname].apply(
        lambda words: [word for word in words if word not in stop_words]
    )
    return df

# Lemmatisation (get the root of words)
def lemm(df, colname):
    """Lemmatize every word of each word list in ``df[colname]``.

    Each word is passed to ``lemmatize`` on the module-level ``wnl``
    lemmatizer. The column is overwritten on ``df`` itself and the same frame
    is returned.
    """
    df[colname] = df[colname].apply(lambda words: list(map(wnl.lemmatize, words)))
    return df

# Convert tokenized text to text
def reverse_tokenize_sentence(df, colname):
    """Join each word list in ``df[colname]`` back into one space-separated string.

    The column is overwritten on ``df`` itself and the same frame is returned.
    """
    joined = df[colname].map(' '.join)
    df[colname] = joined
    return df

# Apply all the cleaning functions to the text
def text_cleaning(df, colname):
    """Run the full cleaning pipeline on ``df[colname]`` and return the frame.

    Steps, in order: lowercase, strip punctuation, tokenize, drop stop words,
    lemmatize, and join the tokens back into a string. Every step writes the
    column back onto the frame it receives, so the ``df`` passed in is
    modified in place.
    """
    cleaning_steps = (
        convert_text_to_lowercase,
        remove_punctuation,
        tokenize_sentence,
        remove_stop_words,
        lemm,
        reverse_tokenize_sentence,
    )
    for step in cleaning_steps:
        df = step(df, colname)
    return df

In [17]:
# Work on a copy so the merged `data` frame keeps the original text
data_clean = data.copy(deep=True)

In [18]:
# Apply the cleaning on the text.
# The return value is not assigned: every cleaning step writes the column
# back onto the frame it receives, so data_clean itself is updated. The
# returned frame is only displayed as the cell output.
text_cleaning(data_clean, 'text') 

  df[colname] = df[colname].str.replace(not_regex(alphanumeric_characters_extended), ' ')


Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,donald trump wish american happy new year leav...,News,"December 31, 2017",fake
1,Drunk Bragging Trump Staffer Started Russian ...,house intelligence committee chairman devin nu...,News,"December 31, 2017",fake
2,Sheriff David Clarke Becomes An Internet Joke...,friday revealed former milwaukee sheriff david...,News,"December 30, 2017",fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,christmas day donald trump announced would bac...,News,"December 29, 2017",fake
4,Pope Francis Just Called Out Donald Trump Dur...,pope francis used annual christmas day message...,News,"December 25, 2017",fake
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,brussels reuters nato ally tuesday welcomed pr...,worldnews,"August 22, 2017",true
44894,LexisNexis withdrew two products from Chinese ...,london reuters lexisnexis provider legal regul...,worldnews,"August 22, 2017",true
44895,Minsk cultural hub becomes haven from authorities,minsk reuters shadow disused soviet-era factor...,worldnews,"August 22, 2017",true
44896,Vatican upbeat on possibility of Pope Francis ...,moscow reuters vatican secretary state cardina...,worldnews,"August 22, 2017",true


In [19]:
# Apply the cleaning on the title.
# The return value is not assigned: every cleaning step writes the column
# back onto the frame it receives, so data_clean itself is updated. The
# returned frame is only displayed as the cell output.
text_cleaning(data_clean, 'title')

  df[colname] = df[colname].str.replace(not_regex(alphanumeric_characters_extended), ' ')


Unnamed: 0,title,text,subject,date,class
0,donald trump sends embarrassing new year eve m...,donald trump wish american happy new year leav...,News,"December 31, 2017",fake
1,drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...,News,"December 31, 2017",fake
2,sheriff david clarke becomes internet joke thr...,friday revealed former milwaukee sheriff david...,News,"December 30, 2017",fake
3,trump obsessed even obama name coded website i...,christmas day donald trump announced would bac...,News,"December 29, 2017",fake
4,pope francis called donald trump christmas speech,pope francis used annual christmas day message...,News,"December 25, 2017",fake
...,...,...,...,...,...
44893,fully committed nato back new u approach afgha...,brussels reuters nato ally tuesday welcomed pr...,worldnews,"August 22, 2017",true
44894,lexisnexis withdrew two product chinese market,london reuters lexisnexis provider legal regul...,worldnews,"August 22, 2017",true
44895,minsk cultural hub becomes authority,minsk reuters shadow disused soviet-era factor...,worldnews,"August 22, 2017",true
44896,vatican upbeat possibility pope francis visiti...,moscow reuters vatican secretary state cardina...,worldnews,"August 22, 2017",true


In [20]:
# Export file: write the cleaned frame to data.csv in the current working
# directory, without the index column
data_clean.to_csv('data.csv', index=False)