In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from google.colab import drive


# **Iris** **dataset**


In [None]:

# Google Drive file ID (from the share link) of the Iris CSV.
file_id = "1MFh6af9VQScrSrw5zUChjqNk3VDD5M0C"
# URL built from the file ID; pandas fetches the CSV from it over HTTP.
file_url = f"https://drive.google.com/uc?id={file_id}"

# Read the remote CSV into a DataFrame and show it as the cell output.
df = pd.read_csv(filepath_or_buffer=file_url)
print("Data loaded successfully!")
display(df)


Data loaded successfully!


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [None]:
# Number of missing values in each column of the loaded frame.
missing_per_column = df.isna().sum()
display(missing_per_column)

Unnamed: 0,0
sepal_length,0
sepal_width,0
petal_length,0
petal_width,0
species,0


In [None]:
# F: every column except the label; T1: the raw (string) species label.
F = df.drop(columns='species')
T1 = df.loc[:, 'species']

### Label Encoding and Standardization

In [None]:
# LabelEncoder is already imported in the notebook's first (imports) cell,
# so it is not re-imported here.

# Names of the feature columns that will be standardised later.
# F holds every column of the frame except 'species'.
numerical_features = list(F.columns)

# Encode the string species labels as integers; `classes_` keeps the
# position -> class-name mapping for decoding later.
label_encoder = LabelEncoder()
T = label_encoder.fit_transform(T1)

print("Numerical Features:", numerical_features)
print("Encoded Target Classes:", list(label_encoder.classes_))
print("Encoded Target Values (target):", T)
# Sum of the encoded labels, shown as the cell's output value.
T.sum()

Numerical Features: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
Encoded Target Classes: ['setosa', 'versicolor', 'virginica']
Encoded Target Values (target): [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


np.int64(150)

In [None]:
# Standardise the selected feature columns with StandardScaler.
# NOTE(review): the statistics are learned from every row of F; anything
# evaluated on a held-out split should use a scaler fitted on the training
# rows only.
scaler = StandardScaler()
scaler.fit(F[numerical_features])
F_scaled = scaler.transform(F[numerical_features])

# Preview the first five scaled rows.
display(F_scaled[:5])


array([[-0.90068117,  1.03205722, -1.3412724 , -1.31297673],
       [-1.14301691, -0.1249576 , -1.3412724 , -1.31297673],
       [-1.38535265,  0.33784833, -1.39813811, -1.31297673],
       [-1.50652052,  0.10644536, -1.2844067 , -1.31297673],
       [-1.02184904,  1.26346019, -1.3412724 , -1.31297673]])

In [None]:
# Wrap the scaled array and the encoded labels in DataFrames so that they
# carry column names.
F_processed = pd.DataFrame(data=F_scaled, columns=numerical_features)
T_processed = pd.DataFrame({'species': T})
print("All features are numerical. Using scaled numerical features as final processed data.")

# Preview the first rows of the processed features, then of the target.
display(F_processed.head(), T_processed.head())



All features are numerical. Using scaled numerical features as final processed data.


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.900681,1.032057,-1.341272,-1.312977
1,-1.143017,-0.124958,-1.341272,-1.312977
2,-1.385353,0.337848,-1.398138,-1.312977
3,-1.506521,0.106445,-1.284407,-1.312977
4,-1.021849,1.26346,-1.341272,-1.312977


Unnamed: 0,species
0,0
1,0
2,0
3,0
4,0


### Splitting and Saving the Data

In [None]:
# Split the *unscaled* features so the scaler used for the saved data never
# sees the test rows (F_processed was standardised with statistics from the
# whole dataset, which leaks test information into training).
# Stratifying on the encoded species preserves the class proportions in both
# partitions of this small dataset.
F_train_raw, F_test_raw, T_train, T_test = train_test_split(
    F[numerical_features],
    T_processed,
    test_size=0.2,
    random_state=42,
    stratify=T_processed['species'],
)

# Learn mean / standard deviation from the training rows only, then apply the
# same transformation to both partitions.
split_scaler = StandardScaler().fit(F_train_raw)
F_train = pd.DataFrame(
    split_scaler.transform(F_train_raw),
    columns=numerical_features,
    index=F_train_raw.index,
)
F_test = pd.DataFrame(
    split_scaler.transform(F_test_raw),
    columns=numerical_features,
    index=F_test_raw.index,
)


In [None]:
# Write each split to its own CSV file, leaving the pandas index out.
iris_outputs = {
    'irisFeatures_train_processed.csv': F_train,
    'irisFeatures_test_processed.csv': F_test,
    'irisTarget_train.csv': T_train,
    'irisTarget_test.csv': T_test,
}
for csv_name, frame in iris_outputs.items():
    frame.to_csv(csv_name, index=False)

print("Processed data saved successfully!")

Processed data saved successfully!


# **Sentiment** **dataset**

In [None]:
# Google Drive file ID (from the share link) of the Sentiment CSV.
file_id = "1dsT3NZ0NALmivK7bQxN1X_HRieqIF_aY"
# URL built from the file ID; pandas fetches the CSV from it over HTTP.
file_url = f"https://drive.google.com/uc?id={file_id}"

# Read the remote CSV into a DataFrame and list its column names.
df = pd.read_csv(filepath_or_buffer=file_url)
print("Data loaded successfully!")
display(df.columns)


Data loaded successfully!


Index(['Unnamed: 0.1', 'Unnamed: 0', 'Text', 'Sentiment', 'Timestamp', 'User',
       'Platform', 'Hashtags', 'Retweets', 'Likes', 'Country', 'Year', 'Month',
       'Day', 'Hour'],
      dtype='object')

In [None]:
# Preview the first five rows of the loaded frame.
first_rows = df.head(n=5)
display(first_rows)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19


In [None]:
# Number of missing values in each column of the loaded frame.
missing_per_column = df.isna().sum()
display(missing_per_column)

Unnamed: 0,0
Unnamed: 0.1,0
Unnamed: 0,0
Text,0
Sentiment,0
Timestamp,0
User,0
Platform,0
Hashtags,0
Retweets,0
Likes,0


### Feature Selection

In [None]:
# Columns excluded from the feature matrix: the two saved-index columns, the
# target itself, and the metadata/time columns.
dropped_columns = [
    'Unnamed: 0.1', 'Unnamed: 0',
    'Sentiment',
    'Timestamp', 'User', 'Platform', 'Hashtags', 'Country',
    'Year', 'Month', 'Day', 'Hour',
]
x = df.drop(columns=dropped_columns)
# Target: the sentiment label of each row.
y = df.loc[:, 'Sentiment']

### Splitting and Saving the Data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Write each split to its own CSV file, leaving the pandas index out.
sentiment_outputs = {
    'SentimentFeatures_train_processed.csv': X_train,
    'SentimentFeatures_test_processed.csv': X_test,
    'SentimentTarget_train.csv': y_train,
    'SentimentTarget_test.csv': y_test,
}
for csv_name, split_data in sentiment_outputs.items():
    split_data.to_csv(csv_name, index=False)

print("Processed data saved successfully!")

Processed data saved successfully!
