In [2]:
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def loss(y, y_hat):
    loss = -np.mean(y*(np.log(y_hat)) - (1-y)*np.log(1-y_hat))
    return loss

def gradients(X, y, y_hat):
    
    # X --> Input.
    # y --> true/target value.
    # y_hat --> hypothesis/predictions.
    # w --> weights (parameter).
    # b --> bias (parameter).
    
    # m-> number of training examples.
    m = X.shape[0]
    
    # Gradient of loss w.r.t weights.
    dw = (1/m)*np.dot(X.T, (y_hat - y))
    
    # Gradient of loss w.r.t bias.
    db = (1/m)*np.sum((y_hat - y)) 
    
    return dw, db


def normalize(X, epsilon=1e-8):
    
    # X --> Input.
    
    # m-> number of training examples
    # n-> number of features 
    m, n = X.shape
    
    # Normalizing all the n features of X.
    X = (X - X.mean(axis=0)) / (X.std(axis=0) + epsilon)
        
    return X

def train(X, y, bs, epochs, lr):
    
    # X --> Input.
    # y --> true/target value.
    # bs --> Batch Size.
    # epochs --> Number of iterations.
    # lr --> Learning rate.
        
    # m-> number of training examples
    # n-> number of features 
    m, n = X.shape
    
    # Initializing weights and bias to zeros.
    w = np.zeros((n,1))
    b = 0
    
    # Reshaping y.
    y = y.reshape(m,1)
    
    # Normalizing the inputs.
    x = normalize(X)
    
    # Empty list to store losses.
    losses = []
    
    # Training loop.
    for epoch in range(epochs):
        for i in range((m-1)//bs + 1):
            
            # Defining batches. SGD.
            start_i = i*bs
            end_i = start_i + bs
            xb = X[start_i:end_i]
            yb = y[start_i:end_i]
            
            # Calculating hypothesis/prediction.
            y_hat = sigmoid(np.dot(xb, w) + b)
            
            # Getting the gradients of loss w.r.t parameters.
            dw, db = gradients(xb, yb, y_hat)
            
            # Updating the parameters.
            w -= lr*dw
            b -= lr*db
        
        # Calculating loss and appending it in the list.
        l = loss(y, sigmoid(np.dot(X, w) + b))
        losses.append(l)
        
    # returning weights, bias and losses(List).
    return w, b, losses

def predict(X, w, b): 
    
    # X --> Input.
    
    # Normalizing the inputs.
    x = normalize(X)
    
    # Calculating presictions/y_hat.
    preds = sigmoid(np.dot(x, w) + b)
    
    # Empty List to store predictions.
    pred_class = []
    # if y_hat >= 0.5 --> round up to 1
    # if y_hat < 0.5 --> round up to 1
    pred_class = [1 if i > 0.5 else 0 for i in preds]
    
    return np.array(pred_class)

In [1]:
import pandas as pd

# Load and prepare the data
data = pd.read_csv('data/train_tfidf_features.csv')
X = data.drop(['label', 'id'], axis=1).values
y = data['label'].values

# Manually split the data
np.random.seed(21)  # For reproducibility
indices = np.arange(X.shape[0])
np.random.shuffle(indices)

split_index = int(0.8 * X.shape[0])  # 80% for training, 20% for validation
train_indices = indices[:split_index]
val_indices = indices[split_index:]

X_train, X_val = X[train_indices], X[val_indices]
y_train, y_val = y[train_indices], y[val_indices]

# Train the model
w, b, losses = train(X_train, y_train, bs=17184, epochs=1000, lr=0.01)

# Evaluate the model on the validation set
y_val_pred = predict(X_val, w, b)
accuracy = np.mean(y_val_pred == y_val)

print(f"Validation Accuracy: {accuracy * 100:.2f}%")



NameError: name 'np' is not defined

In [19]:
ts = pd.read_csv('data/test_tfidf_features.csv')
predictions = predict(ts.drop('id', axis=1).values)
print(predictions)

[0 0 0 ... 0 0 0]


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load and prepare the data
data = pd.read_csv('data/train_tfidf_features.csv')
X = data.drop(['label', 'id'], axis=1).values
y = data['label'].values

# Split data into training and validation sets using scikit-learn
np.random.seed(42)  # For reproducibility
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data (scikit-learn's LogisticRegression handles this internally with `StandardScaler`)
# If you want to explicitly normalize:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Train the logistic regression model
model = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)
model.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_val_pred)

print(f"Validation Accuracy: {accuracy * 100:.2f}%")


Validation Accuracy: 65.46%
