In [1]:
import numpy as np

In [2]:
# Build a 1-D NumPy array from a plain Python list of integers
numbers = [1, 2, 3, 4, 5]
arr = np.array(numbers)
print("Array:", arr)



Array: [1 2 3 4 5]


In [3]:
# Turn the flat 5-element array into a column: 5 rows, 1 column
column_shape = (5, 1)
matrix = arr.reshape(column_shape)
print("Reshaped Matrix:\n", matrix)

Reshaped Matrix:
 [[1]
 [2]
 [3]
 [4]
 [5]]


In [4]:
# Element-wise arithmetic: multiplying the array by itself squares every entry
arr_squared = arr * arr
print("Squared Array:", arr_squared)

Squared Array: [ 1  4  9 16 25]


In [5]:
# Generating a random matrix
# Draw the 3x3 uniform [0, 1) samples from a seeded Generator so that a
# fresh "Restart Kernel -> Run All" reproduces the same values; the unseeded
# legacy np.random.rand call produced a different matrix on every run.
rng = np.random.default_rng(42)
random_matrix = rng.random((3, 3))
print("Random Matrix:\n", random_matrix)

Random Matrix:
 [[0.29567236 0.76440365 0.98712722]
 [0.77204409 0.09687235 0.36061115]
 [0.05933378 0.74478696 0.66389972]]


## Working with pandas

In [6]:
import pandas as pd


In [8]:
# Assemble a small table column by column, then wrap it in a DataFrame
names = ['bob', 'me', 'you']
ages = [23, 45, 19]
salaries = [100000, 13000, 90000]
data = {'Name': names, 'Age': ages, 'Salary': salaries}
df = pd.DataFrame(data)
print("DataFrame:\n", df)

# Alternative source: the same kind of table could be loaded from a CSV file
# df = pd.read_csv("data.csv")

DataFrame:
   Name  Age  Salary
0  bob   23  100000
1   me   45   13000
2  you   19   90000


In [9]:
# Preview the top of the table (at most the first 5 rows)
first_rows = df.head(n=5)
print(first_rows)

  Name  Age  Salary
0  bob   23  100000
1   me   45   13000
2  you   19   90000


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
summary_stats = df.describe()
print(summary_stats)

        Age         Salary
count   3.0       3.000000
mean   29.0   67666.666667
std    14.0   47606.022028
min    19.0   13000.000000
25%    21.0   51500.000000
50%    23.0   90000.000000
75%    34.0   95000.000000
max    45.0  100000.000000


In [11]:
# Handling missing values
# Replace every missing entry with 0. The result is rebound to `df` rather
# than using inplace=True: pandas discourages inplace (it gives no speed-up
# and prevents chaining), and rebinding leaves previously displayed frame
# objects untouched.
df = df.fillna(0)

In [13]:
# Keep only the rows whose Age is strictly greater than 30
is_over_30 = df["Age"] > 30
filtered_df = df[is_over_30]
print("Filtered Data:\n", filtered_df)

Filtered Data:
   Name  Age  Salary
1   me   45   13000


### Scikit-Learn basics

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Toy dataset: the single feature equals its class label (1-5), repeated four times
y = [1, 2, 3, 4, 5] * 4  # Labels
X = [[label] for label in y]

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a logistic-regression classifier on the training portion
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out samples
y_pred = model.predict(X_test)

# Fraction of held-out samples that were classified correctly
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


### Feature Engineering & Data Preprocessing

In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [6]:
# Small example table with one missing Age and one missing Salary
data = {
    'Age': [23, 30, 35, 42, None],
    'Salary': [14000, 60000, 120000, None, 5],
    'Gender': ['Guy', 'Lady', 'Lady', 'Guy', 'Guy'],
}
df = pd.DataFrame(data)

# Impute each numeric column's gaps with that column's own mean
for numeric_col in ("Age", "Salary"):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].mean())

In [8]:
# Encoding categorical variable
# One-hot encode Gender into a dense array with one indicator column per category.
encoder = OneHotEncoder(sparse_output=False)
gender_encoded = encoder.fit_transform(df[['Gender']])
# Label the indicator columns from the categories the encoder actually learned
# instead of a hard-coded list, so names cannot drift out of sync with the data.
# Reusing df's index keeps the rows aligned for a later column-wise concat.
df_gender = pd.DataFrame(
    gender_encoded,
    columns=encoder.categories_[0],
    index=df.index,
)

In [9]:
# Standardize the numeric columns and write the scaled values back into df
numeric_cols = ['Age', 'Salary']
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [10]:
# Concatenating the final processed data: scaled numeric columns next to the one-hot Gender columns
numeric_part = df[['Age', 'Salary']]
df_final = pd.concat([numeric_part, df_gender], axis=1)
print("Preprocessed Data:\n", df_final)

Preprocessed Data:
         Age    Salary  Guy  Lady
0 -1.529079 -0.823022  1.0   0.0
1 -0.402389  0.274301  0.0   1.0
2  0.402389  1.705592  0.0   1.0
3  1.529079  0.000000  1.0   0.0
4  0.000000 -1.156871  1.0   0.0
