In [1]:
#1. Linear Regression - Sales Dataset

#(Slip No. 1, 12 - Identical)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Set random seed for reproducibility
np.random.seed(42)

# Create the 'sales' dataset with 500 random entries
data = {
    'ID': np.arange(1, 501),
    'TV': np.random.randint(10, 300, 500),
    'Radio': np.random.randint(5, 100, 500),
    'Newspaper': np.random.randint(0, 50, 500),
    'Sales': np.random.randint(50, 300, 500)
}
df = pd.DataFrame(data)
print("Dataset Sample (first 5 rows):")
print(df.head())

# Define independent (X) and target (y) variables
X = df[['TV', 'Radio', 'Newspaper']]  # Independent variables
y = df['Sales']  # Target variable

# Split into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Build and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print model coefficients and performance metrics
print("\nModel Coefficients:")
print(f"TV coefficient: {model.coef_[0]:.4f}")
print(f"Radio coefficient: {model.coef_[1]:.4f}")
print(f"Newspaper coefficient: {model.coef_[2]:.4f}")
print(f"Intercept: {model.intercept_:.4f}")

print("\nModel Performance:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared: {r2:.4f}")

Dataset Sample (first 5 rows):
   ID   TV  Radio  Newspaper  Sales
0   1  112     46         32    108
1   2  280     13         37    121
2   3  116     54         12    197
3   4   81     31         30    142
4   5  198     70         46    112

Model Coefficients:
TV coefficient: 0.0990
Radio coefficient: 0.1756
Newspaper coefficient: 0.1176
Intercept: 144.1505

Model Performance:
Mean Squared Error: 5218.3077
R-squared: -0.0001


In [2]:
#2. Linear Regression - Real Estate Dataset

#(Slip No. 4, 15 - Identical)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Set random seed for reproducibility
np.random.seed(42)

# Create the 'realestate' dataset with 500 random entries
data = {
    'ID': np.arange(1, 501),
    'flat': np.random.randint(10, 100, 500),
    'houses': np.random.randint(5, 50, 500),
    'purchases': np.random.randint(50, 200, 500)
}
df = pd.DataFrame(data)

# Define independent (X) and target (y) variables
X = df[['flat', 'houses']]  # Independent variables
y = df['purchases']  # Target variable

# Split into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Build and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print model coefficients
print("\nModel Coefficients:")
print(f"Flat coefficient: {model.coef_[0]:.4f}")
print(f"Houses coefficient: {model.coef_[1]:.4f}")
print(f"Intercept: {model.intercept_:.4f}")


Model Coefficients:
Flat coefficient: 0.0684
Houses coefficient: 0.1469
Intercept: 115.3210


In [3]:
#3. Logistic Regression - User Dataset

#(Slip No. 2, 18 - Identical)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create 'User' dataset with 500 entries
np.random.seed(42)
data = {
    'User ID': np.arange(1, 501),
    'Gender': np.random.choice(['Male', 'Female'], 500),
    'Age': np.random.randint(18, 60, 500),
    'EstimatedSalary': np.random.randint(20000, 120000, 500),
    'Purchased': np.random.choice([0, 1], 500)
}
df = pd.DataFrame(data)

# Convert categorical column to numeric
encoder = LabelEncoder()
df['Gender'] = encoder.fit_transform(df['Gender'])

# Define independent (X) and target (y) variables
X = df[['Gender', 'Age', 'EstimatedSalary']]
y = df['Purchased']

# Split into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Build and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.4533


In [4]:
#4. Apriori Algorithm - Market Basket Analysis

#(Slip No. 5, 7, 9, 10, 13, 16, 19, 20 - Identical)

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Define the dataset
dataset = [
    ['Bread', 'Milk'],
    ['Bread', 'Diaper', 'Beer', 'Eggs'],
    ['Milk', 'Diaper', 'Beer', 'Coke'],
    ['Bread', 'Milk', 'Diaper', 'Beer'],
    ['Bread', 'Milk', 'Diaper', 'Coke']
]

# Convert categorical values to numeric format (one-hot encoding)
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Apply Apriori algorithm with different min_support values
min_support_values = [0.6, 0.4, 0.2]  # Testing multiple thresholds
for min_sup in min_support_values:
    frequent_itemsets = apriori(df, min_support=min_sup, use_colnames=True)
    print(f"\nFrequent itemsets with min_support={min_sup}:")
    print(frequent_itemsets)

    # Generate association rules
    if not frequent_itemsets.empty:
        rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
        print("\nAssociation Rules:")
        print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])


Frequent itemsets with min_support=0.6:
   support         itemsets
0      0.6           (Beer)
1      0.8          (Bread)
2      0.8         (Diaper)
3      0.8           (Milk)
4      0.6   (Beer, Diaper)
5      0.6  (Bread, Diaper)
6      0.6    (Bread, Milk)
7      0.6   (Diaper, Milk)

Association Rules:
  antecedents consequents  support  confidence    lift
0      (Beer)    (Diaper)      0.6        1.00  1.2500
1    (Diaper)      (Beer)      0.6        0.75  1.2500
2     (Bread)    (Diaper)      0.6        0.75  0.9375
3    (Diaper)     (Bread)      0.6        0.75  0.9375
4     (Bread)      (Milk)      0.6        0.75  0.9375
5      (Milk)     (Bread)      0.6        0.75  0.9375
6    (Diaper)      (Milk)      0.6        0.75  0.9375
7      (Milk)    (Diaper)      0.6        0.75  0.9375

Frequent itemsets with min_support=0.4:
    support               itemsets
0       0.6                 (Beer)
1       0.8                (Bread)
2       0.4                 (Coke)
3       0.8

In [5]:
#5. Text Summarization - Extractive Approach

#(Slip No. 8 - Unique)

import nltk
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict

nltk.download('punkt')
nltk.download('stopwords')

text = """Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans using natural language. NLP techniques enable computers to process and understand human language, allowing applications such as speech recognition, machine translation, and sentiment analysis. The field has seen significant advancements due to deep learning and large-scale language models."""

# Clean text
clean_text = re.sub(r'[^a-zA-Z\s]', '', text)
sentences = sent_tokenize(clean_text)

# Calculate word frequency
stop_words = set(stopwords.words('english'))
word_freq = defaultdict(int)
for sentence in sentences:
    words = word_tokenize(sentence.lower())
    for word in words:
        if word not in stop_words:
            word_freq[word] += 1

# Score sentences
sentence_scores = {i: sum(word_freq[word] for word in word_tokenize(sent.lower()) if word in word_freq) for i, sent in enumerate(sentences)}

# Select top sentences
top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:2]
summary = " ".join(sentences[i] for i in sorted(top_sentences))

print("Extractive Summary:\n", summary)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [7]:
#6. Linear Regression for Fish Species Weight Prediction

#(Using Fish Market Dataset from Kaggle)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
file_path = "/mnt/data/file-UALYDjz2i1yWVGCGNFMdWL"  # Update with your actual file path
df = pd.read_csv(file_path)

# Display first few rows
print("Dataset Sample:")
print(df.head())

# Selecting features and target variable
X = df[['Length1', 'Length2', 'Length3', 'Height', 'Width']]  # Features
y = df['Weight']  # Target variable

# Splitting dataset (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Model evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results
print("\nModel Coefficients:")
print(f"Intercept: {model.intercept_:.4f}")
for feature, coef in zip(X.columns, model.coef_):
    print(f"{feature}: {coef:.4f}")

print("\nModel Performance:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/file-UALYDjz2i1yWVGCGNFMdWL'

In [8]:
#1Logistic Regression on the Iris Dataset

#(Statistical Summary + Classification)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import datasets

# Load the Iris dataset
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = iris.target

# Convert target numbers to species names
df['species'] = df['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})

# Display basic statistical details
print("Basic Statistics:\n")
print(df.groupby('species').describe())

# Prepare data for logistic regression
X = df.iloc[:, :-1]  # Features: Sepal/Petal lengths & widths
y = df['species']  # Target variable

# Encode labels (setosa=0, versicolor=1, virginica=2)
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# Split into train and test sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# Train Logistic Regression model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Predict species
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

Basic Statistics:

           sepal length (cm)                                              \
                       count   mean       std  min    25%  50%  75%  max   
species                                                                    
setosa                  50.0  5.006  0.352490  4.3  4.800  5.0  5.2  5.8   
versicolor              50.0  5.936  0.516171  4.9  5.600  5.9  6.3  7.0   
virginica               50.0  6.588  0.635880  4.9  6.225  6.5  6.9  7.9   

           sepal width (cm)         ... petal length (cm)       \
                      count   mean  ...               75%  max   
species                             ...                          
setosa                 50.0  3.428  ...             1.575  1.9   
versicolor             50.0  2.770  ...             4.600  5.1   
virginica              50.0  2.974  ...             5.875  6.9   

           petal width (cm)                                            
                      count   mean       std  min  25% 