#importing data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
ArtDf = pd.read_csv("/content/Art - Art.csv")
SportDf = pd.read_csv("/content/Sports - Sports.csv")
EconamyDf = pd.read_csv("/content/Economy - Economy.csv")
ArtDf = ArtDf.drop("Unnamed: 0",axis=1)
SportDf = SportDf.drop("Unnamed: 0",axis=1)
EconamyDf = EconamyDf.drop("Unnamed: 0",axis=1)

#cleaning data

In [3]:
ArtDf.shape[0], SportDf.shape[0], EconamyDf.shape[0]

(10457, 9980, 10000)

In [4]:
min_rows = SportDf.shape[0]

In [5]:
while(ArtDf.shape[0] > min_rows):
  ArtDf = ArtDf.iloc[1:]
while(EconamyDf.shape[0] > min_rows):
  EconamyDf = EconamyDf.iloc[1:]

ArtDf.shape[0], SportDf.shape[0], EconamyDf.shape[0]

(9980, 9980, 9980)

In [6]:
DataDf = pd.concat([ArtDf,SportDf,EconamyDf], ignore_index=True)
DataDf

Unnamed: 0,Article Title,Category
0,\n\nJohannesburg highlights African art\n\n\n\...,Art
1,"\n\nDecade on, Iraq to replace iconic Saddam s...",Art
2,\n\nVideo project by late Amal Kenawy showcase...,Art
3,\n\nEgypt paintings might fetch record price a...,Art
4,\n\nSleeping Beauty recreated in Kiev museum\n...,Art
...,...,...
29935,Egypt imposes three-month export ban on cookin...,economy
29936,"Egypt bans export of wheat, other staples for ...",economy
29937,"Egypt’s annual inflation hit 10% in February, ...",economy
29938,INTERVIEW: EBRD supports Egypt’s green transfo...,economy


#cleaning sapces and etc.

In [7]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Function to preprocess text
def preprocess_text(text):
    # Remove punctuations
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    # Convert to lowercase
    tokens_lower = [word.lower() for word in tokens]

    # Remove stopwords and prepositions
    stop_words = set(stopwords.words('english'))
    prepositions = set(["about", "above", "across", "after", "against", "along", "among", "around", "at",
                        "before", "behind", "below", "beneath", "beside", "between", "beyond", "but", "by",
                        "despite", "down", "during", "except", "for", "from", "in", "inside", "into", "like",
                        "near", "of", "off", "on", "onto", "out", "outside", "over", "past", "since", "through",
                        "throughout", "till", "to", "toward", "under", "underneath", "until", "up", "upon", "with",
                        "within", "without"])
    filtered_tokens = [word for word in tokens_lower if word not in stop_words and word not in prepositions]

    return ' '.join(filtered_tokens)

# Apply preprocessing function to the 'text_column'
DataDf['Article Title'] = DataDf['Article Title'].apply(preprocess_text)

# Print the preprocessed DataFrame
DataDf


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,Article Title,Category
0,johannesburg highlights african art fourth ann...,Art
1,decade iraq replace iconic saddam statue monum...,Art
2,video project late amal kenawy showcased beiru...,Art
3,egypt paintings might fetch record price chris...,Art
4,sleeping beauty recreated kiev museum ukrainia...,Art
...,...,...
29935,egypt imposes three month export ban cooking o...,economy
29936,egypt bans export wheat staples 3 months egypt...,economy
29937,egypt annual inflation hit 10 february highest...,economy
29938,interview ebrd supports egypt green transforma...,economy


#Lemmatizer

In [8]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to preprocess text and perform lemmatization
def preprocess_and_lemmatize(text):
    # Tokenize the text
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]

    # Lemmatize the tokens
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Return the lemmatized text
    return ' '.join(lemmatized_tokens)

# Apply preprocessing and lemmatization to the 'text_column'
DataDf['Article Title'] = DataDf['Article Title'].apply(preprocess_and_lemmatize)

# Print the preprocessed DataFrame
DataDf

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Article Title,Category
0,johannesburg highlight african art fourth annu...,Art
1,decade iraq replace iconic saddam statue monum...,Art
2,video project late amal kenawy showcased beiru...,Art
3,egypt painting might fetch record price christ...,Art
4,sleeping beauty recreated kiev museum ukrainia...,Art
...,...,...
29935,egypt imposes three month export ban cooking o...,economy
29936,egypt ban export wheat staple 3 month egypt mi...,economy
29937,egypt annual inflation hit 10 february highest...,economy
29938,interview ebrd support egypt green transformat...,economy


#testing different embadears

#TF-IDY

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

TF_Df = pd.DataFrame(DataDf)

# # Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# # Fit the vectorizer to the text data and transform the text data into TF-IDF vectors
tfidf_vectors = tfidf_vectorizer.fit_transform(TF_Df['Article Title'])

X = tfidf_vectors

y = TF_Df["Category"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [11]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Initialize SVM classifier
svm_classifier = SVC(kernel='rbf')  # You can choose different kernels, such as 'linear', 'rbf', 'poly', etc.

# Train the classifier on the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9891449565798264


#spacy

In [12]:
import pandas as pd
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

spacyDf = pd.DataFrame(DataDf)

# Function to get spaCy embeddings
def get_spacy_embeddings(text):
    doc = nlp(text)
    # Summarize the embeddings by averaging
    return doc.vector

# Apply the function to each row in the DataFrame
spacy_embeddings = spacyDf['Article Title'].apply(get_spacy_embeddings)

spacy_embeddings

0        [-0.115409315, -0.86550903, 0.03866599, 0.4845...
1        [-0.3767882, -0.8503333, 0.06324949, 0.1484868...
2        [-0.038242273, -0.9201352, -0.19139674, 0.2892...
3        [-0.25687468, -0.64090514, 0.007944104, 0.2796...
4        [-0.19098614, -0.4051015, -0.13616051, 0.25890...
                               ...                        
29935    [-0.08746883, -0.36719945, -0.17940418, 0.1722...
29936    [-0.035680827, -0.6309122, -0.0062389425, 0.21...
29937    [-0.18242499, -0.43050715, -0.06770115, 0.0166...
29938    [-0.23525289, -0.7031529, 0.029996332, 0.22292...
29939    [-0.095927395, -0.7404859, 0.014485272, 0.2754...
Name: Article Title, Length: 29940, dtype: object

In [13]:
# Split the data into features (embeddings) and target
X = pd.DataFrame(spacy_embeddings.tolist())  # Features (embeddings)
y = spacyDf['Category']  # Target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print shapes of the resulting sets
print("Train set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)
print(X_train)
print(y_train)

Train set shape: (23952, 96) (23952,)
Test set shape: (5988, 96) (5988,)
             0         1         2         3         4         5         6   \
17295  0.057056 -0.514621 -0.019954  0.291320 -0.089189 -0.072065  0.061422   
7063  -0.241559 -0.748635  0.048863  0.018620  0.025769  0.039727  0.207214   
29094 -0.257065 -0.316527 -0.196564  0.182522  0.064196 -0.002803  0.374882   
18251 -0.279450 -0.537470  0.085422  0.480163  0.148699 -0.256577 -0.098490   
8227  -0.207340 -0.719431 -0.245174  0.071377 -0.266298 -0.271394  0.157560   
...         ...       ...       ...       ...       ...       ...       ...   
29802  0.062903 -0.439317  0.039804 -0.169990 -0.230089  0.058156  0.152797   
5390  -0.078492 -0.419708  0.031606 -0.191437  0.068464 -0.395936 -0.074352   
860   -0.145827 -0.924279 -0.152689  0.211990 -0.177415 -0.093060  0.232880   
15795 -0.087176 -0.880858 -0.062547  0.501543 -0.112643 -0.237240  0.028560   
23654 -0.235277 -0.632100  0.006951  0.086772 -0.367740 -0

In [14]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Initialize SVM classifier
svm_classifier = SVC(kernel='rbf')  # You can choose different kernels, such as 'linear', 'rbf', 'poly', etc.

# Train the classifier on the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.802939211756847


#FastText

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
import gensim.downloader

# Download the FastText model
fasttext_model = gensim.downloader.load('fasttext-wiki-news-subwords-300')

fastTextDf = pd.DataFrame(DataDf)

# Function to compute FastText embeddings for text
def get_fasttext_embeddings(text):
    tokens = text.split()  # Tokenize the text
    embeddings = [fasttext_model[word] for word in tokens if word in fasttext_model.key_to_index]  # Get embeddings for each token
    if embeddings:
        return pd.Series(embeddings).mean()  # Average the embeddings and return
    else:
        return pd.Series([0] * fasttext_model.vector_size)  # Return zeros if no embeddings found

# Compute FastText embeddings for each text in the text column
df_embeddings = fastTextDf['Article Title'].apply(get_fasttext_embeddings)

# Create a new DataFrame with the embeddings
df_fasttext = pd.DataFrame(df_embeddings.tolist())



In [16]:

# Split the data into features (X) and the target (Y)
X = df_fasttext
Y = fastTextDf['Category']

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Print shapes of the resulting sets
print("Train set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Train set shape: (23952, 300) (23952,)
Test set shape: (5988, 300) (5988,)


In [17]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Initialize SVM classifier
svm_classifier = SVC(kernel='rbf')  # You can choose different kernels, such as 'linear', 'rbf', 'poly', etc.

# Train the classifier on the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9861389445557782


#sentence-transformer

In [19]:
!pip install sentence-transformers



In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer

# Load a pre-trained sentence transformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

senTDf = pd.DataFrame(DataDf)

# Function to compute sentence embeddings for text
def get_sentence_embeddings(text):
    return model.encode(text)

# Compute sentence embeddings for each text in the text column
senT_embeddings = senTDf['Article Title'].apply(get_sentence_embeddings)

# Create a new DataFrame with the embeddings
df_sentence_transformers = pd.DataFrame(senT_embeddings.tolist())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:
# Split the data into features (X) and the target (Y)
X = df_sentence_transformers
Y = senTDf['Category']

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Print shapes of the resulting sets
print("Train set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Train set shape: (23952, 384) (23952,)
Test set shape: (5988, 384) (5988,)


In [22]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Initialize SVM classifier
svm_classifier = SVC(kernel='rbf')  # You can choose different kernels, such as 'linear', 'rbf', 'poly', etc.

# Train the classifier on the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9884769539078156


#word2vec

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import gensim.downloader

# Download a pre-trained Word2Vec model (GoogleNews-vectors-negative300)
word2vec_model = gensim.downloader.load('word2vec-google-news-300')

word2vecDf = pd.DataFrame(DataDf)

# Function to compute Word2Vec embeddings for text
def get_word2vec_embeddings(text):
    words = text.split()  # Tokenize the text
    embeddings = [word2vec_model[word] for word in words if word in word2vec_model.key_to_index]  # Get embeddings for each word
    if embeddings:
        return pd.Series(embeddings).mean()  # Average the embeddings and return
    else:
        return pd.Series([0] * 300)  # Return zeros if no embeddings found (assuming 300-dimensional embeddings)

# Compute Word2Vec embeddings for each text in the text column
word2vec_embeddings = word2vecDf['Article Title'].apply(get_word2vec_embeddings)

# Create a new DataFrame with the embeddings
df_word2vec = pd.DataFrame(word2vec_embeddings.tolist())

In [25]:
# Split the data into features (X) and the target (Y)
X = df_word2vec
Y = word2vecDf['Category']

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Print shapes of the resulting sets
print("Train set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Train set shape: (23952, 300) (23952,)
Test set shape: (5988, 300) (5988,)


In [26]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Initialize SVM classifier
svm_classifier = SVC(kernel='rbf')  # You can choose different kernels, such as 'linear', 'rbf', 'poly', etc.

# Train the classifier on the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9868069472277889
