In [None]:
#Q1 scatter plot on iris
import pandas as pd
import matplotlib.pyplot as plt
# Load the iris measurements and preview the first rows.
iris_data = pd.read_csv("/content/iris.csv")
print(iris_data.head())

# Series used by the plot: both sepal measurements and the class label.
x = iris_data['sepal.length']
y = iris_data['sepal.width']
species = iris_data['variety']

# One scatter call per variety so every variety gets its own colour and
# legend entry; varieties are visited in order of first appearance.
fig, ax = plt.subplots(figsize=(10, 6))
for variety_name in species.unique():
    in_variety = species == variety_name
    ax.scatter(x[in_variety], y[in_variety], label=variety_name)

ax.set_title('Scatter Plot of Iris Dataset')
ax.set_xlabel('Sepal Length (cm)')
ax.set_ylabel('Sepal Width (cm)')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
#Q2 find null values and replace with mean
import pandas as pd
from sklearn.impute import SimpleImputer
# Load the dataset and show a preview before any cleaning.
iris_data = pd.read_csv("/content/iris.csv")
print("Original Data:")
print(iris_data.head())

# Missing-value counts: first per column, then summed over the whole frame.
null_counts_before = iris_data.isnull().sum()
print("\nNull values in each column before imputation:")
print(null_counts_before)

total_null_values = null_counts_before.sum()
print("\nTotal number of null values in the dataset:", total_null_values)

# Every column except the last is passed to the imputer, which replaces each
# missing entry with the mean of its column. The last column is assumed to be
# the 'variety' label -- it is copied over unchanged afterwards.
feature_frame = iris_data.iloc[:, :-1]
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(feature_frame)
iris_data_imputed = pd.DataFrame(imputed_values, columns=feature_frame.columns)
iris_data_imputed['variety'] = iris_data['variety']

print("\nData after imputation:")
print(iris_data_imputed.head())

# Re-count to confirm what is left after imputation.
null_counts_after = iris_data_imputed.isnull().sum()
print("\nNull values in each column after imputation:")
print(null_counts_after)


In [None]:
#Q3 convert categorical values to numeric format in a given dataset using label encoding and one hot encoder
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Load the dataset and preview it before encoding.
iris_data = pd.read_csv("/content/iris.csv")
print("Original Data:")
print(iris_data.head())

# Label encoding: store the integer code of each row's variety in a new
# 'Species_Label' column.
label_encoder = LabelEncoder()
species_labels = label_encoder.fit_transform(iris_data['variety'])
iris_data['Species_Label'] = species_labels
print("\nData with Label Encoded Species:")
print(iris_data.head())

# One-hot encoding: the encoder expects 2-D input, hence the double brackets.
# sparse_output=False asks for a dense array so it can go straight into a
# DataFrame.
onehot_encoder = OneHotEncoder(sparse_output=False)
species_onehot = onehot_encoder.fit_transform(iris_data[['variety']])
onehot_column_names = onehot_encoder.get_feature_names_out(['variety'])
species_onehot_df = pd.DataFrame(species_onehot, columns=onehot_column_names)

# Append the indicator columns to the right of the existing columns.
iris_data = pd.concat([iris_data, species_onehot_df], axis=1)
print("\nData with One-Hot Encoded Species:")
print(iris_data.head())


In [None]:
#Q4 scale values
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Read the salary dataset and preview it.
salary_data = pd.read_csv("/content/salary.csv")

print("Original Data:")
print(salary_data.head())

# Replace missing 'Salary' entries with the column mean (written back in place).
imputer = SimpleImputer(strategy="mean")
salary_data[['Salary']] = imputer.fit_transform(salary_data[['Salary']])

# Each object-dtype column gets its own LabelEncoder; the fitted encoders are
# kept in a dict keyed by column name.
categorical_columns = salary_data.select_dtypes(include=['object']).columns
label_encoders = {name: LabelEncoder() for name in categorical_columns}

for name, encoder in label_encoders.items():
    salary_data[name] = encoder.fit_transform(salary_data[name])

# Standardise every column except 'Salary'.
features = salary_data.drop(columns='Salary')
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

scaled_features_df = pd.DataFrame(scaled_features, columns=features.columns)

# Re-attach the unscaled 'Salary' column as the last column.
scaled_salary_data = pd.concat([scaled_features_df, salary_data['Salary']], axis=1)

print("\nScaled Data:")
print(scaled_salary_data.head())


In [None]:
#Set B
#1 split data into training and test set
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the dataset and preview it.
iris_data = pd.read_csv("/content/iris.csv")
print("Original Data:")
print(iris_data.head())

# The label is the 'variety' column; the features are every other column.
y = iris_data['variety']
X = iris_data.drop(columns='variety')

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the sizes of both partitions, then a preview of each.
print("\nTraining set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

print("\nTraining set:")
print(X_train.head(), y_train.head())

print("\nTest set:")
print(X_test.head(), y_test.head())


In [None]:
#Set B
# 2 scale features using standardization
import pandas as pd
from sklearn.preprocessing import StandardScaler
# Load the salary dataset and preview it before scaling.
data = pd.read_csv("/content/salary.csv")
print("Original Data:")
print(data.head())

# 'Target' is removed only when it exists (errors='ignore'); without it every
# column is treated as a feature.
# NOTE(review): the Q4 cell of this notebook treats 'Salary' as the target of
# salary.csv. If the file has no 'Target' column, 'Salary' is standardised
# here as well -- confirm that this is intended.
features = data.drop('Target', axis=1, errors='ignore')

# StandardScaler only accepts numeric input and raises ValueError on string
# columns, so scaling is restricted to numeric/bool columns. Any other column
# is carried through unchanged. When every column is numeric the result is
# the same as scaling the whole frame.
numeric_columns = features.select_dtypes(include=['number', 'bool']).columns
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features[numeric_columns])

# Keep the original row index so the later concat lines rows up correctly.
scaled_numeric_df = pd.DataFrame(
    scaled_features, columns=numeric_columns, index=features.index
)
passthrough_df = features.drop(columns=numeric_columns)

# Reassemble and restore the original column order.
scaled_features_df = pd.concat([scaled_numeric_df, passthrough_df], axis=1)
scaled_features_df = scaled_features_df[features.columns]

# Re-attach the unscaled target as the last column when one was present.
if 'Target' in data.columns:
    scaled_data = pd.concat([scaled_features_df, data['Target']], axis=1)
else:
    scaled_data = scaled_features_df

print("Scaled Data:")
print(scaled_data.head())

