In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
import string

!pip install langdetect
!pip install umap-learn

from langdetect import detect
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from nltk.corpus import stopwords as nltk_stopwords
from umap import UMAP
import string
from sklearn.model_selection import train_test_split

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=6833bdf38603d0788b3956a7af3acc056e0c367919567d0bcc85c4bdf81592e9
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
Collecting umap-learn
  Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m934.6 kB/s[0m eta [36m0:00:00[0m
Collecting pynndescent>=0.5 (from umap-learn)
  Downloadi

In [5]:
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


# Loading data and tokenization

In [2]:
def clean_data(dato_csv, text_column='text', min_length=20):
    """
    Clean a raw DataFrame and return the cleaned result.

    Steps, applied in this order:
      1. Remove missing values: drop every row that has at least one missing value.
      2. Convert to string: cast all columns to ``str`` so values are handled uniformly.
      3. Remove duplicates: drop rows that are identical across ALL columns.
      4. Filter short texts: drop rows whose ``text_column`` has fewer than
         ``min_length`` characters.

    Lowercasing and special-character removal are intentionally not performed
    here: the TF-IDF vectorizer used downstream already lowercases by default.

    NOTE(review): step 3 compares whole rows, so if the frame carries a unique
    identifier column no two rows are equal and nothing is dropped -- confirm
    whether de-duplicating on the text alone was intended.

    Parameters
    ----------
    dato_csv : pandas.DataFrame
        Frame to clean. It is not modified; a new frame is returned.
    text_column : str, default 'text'
        Name of the column the length filter is applied to.
    min_length : int, default 20
        Minimum number of characters a text must have to be kept.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, all columns as strings, keeping their original
        index labels (the index is not reset).

    Raises
    ------
    KeyError
        If ``text_column`` is not a column of ``dato_csv``.
    """
    # Fail early with a readable message instead of deep inside the filter step.
    if text_column not in dato_csv.columns:
        raise KeyError(f"clean_data: column {text_column!r} not found in DataFrame")

    cleaned = (
        dato_csv
        .dropna()            # 1. rows with missing values
        .astype(str)         # 2. uniform string dtype
        .drop_duplicates()   # 3. exact duplicate rows
    )

    # 4. keep only texts that reach the minimum length
    long_enough = cleaned[text_column].str.len() >= min_length
    return cleaned[long_enough]

In [6]:
# Locations of the input CSV files (relative to the working directory)

file_path_test = 'data/test.csv'
file_path_train = 'data/train.csv'

# Google Drive locations, for running on Colab instead:
#file_path_test = '/content/drive/My Drive/DATALAB/test.csv'
#file_path_train = '/content/drive/My Drive/DATALAB/train.csv'

# Load both CSV files (test first, then train) ...
df_test, df_train = (
    pd.read_csv(csv_path) for csv_path in (file_path_test, file_path_train)
)

# ... then pass each frame through clean_data, in the same order
df_test, df_train = (
    clean_data(loaded_frame) for loaded_frame in (df_test, df_train)
)

# Preview the first rows of the cleaned training data
print(df_train.head())

  id                                              title              author  \
0  0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1  1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2  2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3  3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4  4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...     1  
1  Ever get the feeling your life circles the rou...     0  
2  Why the Truth Might Get You Fired October 29, ...     1  
3  Videos 15 Civilians Killed In Single US Airstr...     1  
4  Print \nAn Iranian woman has been sentenced to...     1  


# We keep only the text in English

In [7]:
# Keep only the rows of a DataFrame whose text is detected as English
def filter_english_text_edit_df(df: pd.DataFrame, text_column: str) -> pd.DataFrame:
    """
    Return a new DataFrame containing only the rows whose text is English.

    Each value of ``text_column`` is passed to ``detect``; a row is kept when
    the detected language code is ``'en'``. Rows for which detection raises an
    error are skipped (best effort). The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter.
    text_column : str
        Name of the column holding the text to run language detection on.

    Returns
    -------
    pd.DataFrame
        The kept rows in their original order, with a fresh 0..n-1 index.
    """
    # Row POSITIONS (not index labels) of the rows to keep: selecting by label
    # would return every row sharing a label when the index has duplicates.
    keep_positions = []

    for position, text in enumerate(df[text_column]):
        try:
            if detect(text) == 'en':
                keep_positions.append(position)
        except Exception:
            # Detection failed for this text (e.g. nothing detectable in it):
            # skip the row. Only ``Exception`` is caught so that
            # KeyboardInterrupt / SystemExit can still stop this slow loop.
            continue

    return df.iloc[keep_positions].reset_index(drop=True)

# VECTORIZATION: Language detection function and TF-IDF vectors.
We compute TF-IDF vectors for unigrams (single words) and for bigrams (pairs of consecutive words). We then create visualizations that show how these vectors are distributed and how often each unigram and bigram appears in fake versus real news.

In [8]:
# TF-IDF vectorization of a text corpus for a single n-gram size
def get_tfidf_vectors(corpus: np.ndarray, stop_words: str, max_features: int, n: int) -> np.ndarray:
    """
    Fit a TF-IDF vectorizer on ``corpus`` and return the transformed corpus.

    Parameters
    ----------
    corpus : np.ndarray
        The documents to vectorize.
    stop_words : str
        Forwarded to ``TfidfVectorizer(stop_words=...)``, e.g. ``'english'``.
    max_features : int
        Forwarded to ``TfidfVectorizer(max_features=...)``; caps the number of
        terms kept.
    n : int
        N-gram size; the vectorizer is built with ``ngram_range=(n, n)``, so
        only n-grams of exactly this size are considered.

    Returns
    -------
    The result of ``fit_transform`` on ``corpus``.
    NOTE(review): scikit-learn documents this as a sparse matrix rather than
    the ``np.ndarray`` in the annotation -- confirm and adjust the hint.

    The fitted vectorizer itself is local to this function and is not
    returned, so its vocabulary is not available to the caller.
    """
    vectorizer_options = {
        'stop_words': stop_words,
        'max_features': max_features,
        'ngram_range': (n, n),
    }
    tfidf = TfidfVectorizer(**vectorizer_options)

    # Learn the vocabulary / IDF weights and vectorize in a single pass
    return tfidf.fit_transform(corpus)

In [9]:
# Keep only the training rows whose text is detected as English
df_train_en = filter_english_text_edit_df(df_train, 'text')

# Targets as plain Python ints, and the raw texts to vectorize
labels1 = df_train_en['label'].copy().values
labels = list(map(int, labels1))
filtered_corpus = df_train_en['text'].copy().values

max_features = 30  # cap on the number of TF-IDF terms; this can be tuned

# TF-IDF features with English stop words removed: unigrams (n=1) and bigrams (n=2)
unigram_vectors_without_stopwords, bigram_vectors_without_stopwords = (
    get_tfidf_vectors(filtered_corpus, 'english', max_features, ngram_size)
    for ngram_size in (1, 2)
)



In [10]:
# Split into train and validation sets: X is the set of vectors computed from
# the TF-IDF matrix (unigrams in this case) and y is the labels.
# NOTE(review): the TF-IDF vectorizer was fitted on the full corpus before this
# split, so the validation rows influenced the features -- confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    unigram_vectors_without_stopwords,
    labels,
    random_state=1,
)

for split_name, split_matrix in (('X_train', X_train), ('X_test', X_test)):
    print(f'{split_name} shape: {split_matrix.shape}')
#print(f'y_train shape: {y_train.shape}')

X_train shape: (13420, 30)
X_test shape: (4474, 30)


# Train the model and evaluate metrics


In [13]:
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score # Import precision_score, recall_score, and f1_score

# Logistic Regression

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Fit a logistic-regression classifier on the training split and predict the held-out split
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions against the held-out labels
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{metric_name}: {metric_value}')

Accuracy: 0.8220831470719714
Precision: 0.7690315898498188
Recall: 0.8092643051771117
F1 Score: 0.7886351566648964


# Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest (fixed seed) on the training split and predict the held-out split
forest_options = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_options)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions against the held-out labels
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

report_lines = [
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1 Score: {f1}',
]
print('\n'.join(report_lines))

Accuracy: 0.8430934286991506
Precision: 0.8034279592929834
Recall: 0.8174386920980926
F1 Score: 0.8103727714748784
