In [None]:
# Colab setup: install the third-party packages this notebook imports.
# %pip (unlike !pip, which runs in a subshell) installs into the environment
# of the running kernel, so the imports in the next cell can see the packages.
%pip install -q langchain_community replicate



In [None]:
from langchain.llms import Replicate
from google.colab import userdata
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import random
import pandas as pd
import os
import replicate

# Replicate model identifier passed to every replicate.run() call below.
model = "ibm-granite/granite-3.3-8b-instruct"

# Read the token from Colab's secret store and export it as the
# REPLICATE_API_TOKEN environment variable.
api_token = userdata.get('api_token')
os.environ.update({'REPLICATE_API_TOKEN': api_token})

In [41]:
# Load the survey data. Header names are stripped FIRST so that every later
# lookup by column name works even if the CSV header has stray whitespace.
df = pd.read_csv("/content/Impact_of_Mobile_Phone_on_Students_Health.csv")
df.columns = df.columns.str.strip()
df = df.dropna(subset=["Health rating"])
df = df.drop(["Names"], axis=1)

# Clean health ratings: keep only the first of any ";"-separated answers
df["Health rating"] = df["Health rating"].str.split(";").str[0].str.strip()

# Fill missing values in symptoms and frequency
df["Usage symptoms"] = df["Usage symptoms"].fillna("None")
df["Symptom frequency"] = df["Symptom frequency"].fillna("Never")

# Encode categorical columns. One fitted encoder is kept per column so the
# integer codes can be mapped back to their labels later. LabelEncoder numbers
# classes in sorted order of the labels, not in any domain order, so the
# mapping is printed to make it visible.
categorical_cols = ["Gender", "Mobile Operating System", "Daily usages"]
label_encoders = {}
for col in categorical_cols:
  le = LabelEncoder()
  df[col] = le.fit_transform(df[col])
  label_encoders[col] = le
  print(f"{col} codes: {dict(enumerate(le.classes_.tolist()))}")

# Exploratory data analysis
print("Health Rating Distribution:")
print(df["Health rating"].value_counts(normalize=True))
# NOTE: "Daily usages" holds LabelEncoder codes at this point, so this mean is
# an average of category codes, not of hours of use.
print("\nAverage Daily Usage by Health Rating:")
print(df.groupby("Health rating")["Daily usages"].mean())

# Class proportions, in the fixed order Excellent/Good/Fair/Poor; a rating
# that never occurs gets weight 0.
class_counts = df["Health rating"].value_counts(normalize=True)
data_weights = [class_counts.get(rating, 0) for rating in ["Excellent", "Good", "Fair", "Poor"]]
print(f"Weights: {data_weights}")

df.head()

Health Rating Distribution:
Health rating
Good         0.444444
Excellent    0.404040
Fair         0.131313
Poor         0.020202
Name: proportion, dtype: float64

Average Daily Usage by Health Rating:
Health rating
Excellent    1.625000
Fair         1.384615
Good         0.977273
Poor         1.500000
Name: Daily usages, dtype: float64
Weights: [np.float64(0.40404040404040403), np.float64(0.4444444444444444), np.float64(0.13131313131313133), np.float64(0.020202020202020204)]


Unnamed: 0,Age,Gender,Mobile Phone,Mobile Operating System,Mobile phone use for education,Mobile phone activities,Helpful for studying,Educational Apps,Daily usages,Performance impact,Usage distraction,Attention span,Useful features,Health Risks,Beneficial subject,Usage symptoms,Symptom frequency,Health precautions,Health rating
0,21-25,1,Yes,0,Sometimes,Social Media,Yes,Educational Videos,1,Agree,During Exams,Yes,Camera,Yes,Accounting,Headache,Never,Using Blue light filter,Excellent
1,21-25,1,Yes,0,Sometimes,Social Media,Yes,Educational Videos,1,Neutral,During Exams,Yes,Notes Taking App,Yes,Browsing Material,All of these,Sometimes,Taking Break during prolonged use,Good
2,21-25,1,Yes,1,Sometimes,All of these,Yes,Educational Videos,1,Strongly agree,Not Distracting,No,Camera,Yes,Browsing Material,All of these,Sometimes,None of Above,Excellent
3,21-25,1,Yes,0,Frequently,All of these,Yes,Educational Videos,0,Strongly agree,During Class Lectures,No,Internet Access,Only Partially,Reasarch,,Never,Limiting Screen Time,Excellent
4,21-25,1,Yes,1,Frequently,All of these,Yes,Educational Videos,3,Agree,While Studying,Yes,Internet Access,No,Browsing Material,Sleep disturbance,Sometimes,None of Above,Excellent


In [44]:
# The four health-rating labels used in this notebook, in a fixed order
HEALTH_CLASSES = [
    "Excellent",
    "Good",
    "Fair",
    "Poor",
]

# Create prompt for each student
def create_prompt(row, usage_labels=None):
    """Build the few-shot prompt asking the LLM for one student's health rating.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing "Age",
            "Gender", "Daily usages", "Mobile phone activities",
            "Usage symptoms" and "Symptom frequency".
        usage_labels: Optional mapping from the integer "Daily usages" code to
            its text label, e.g. ``dict(enumerate(encoder.classes_))`` from the
            encoder fitted on that column. When omitted, a default mapping is
            derived as described below.

    Returns:
        The prompt string. A "Daily usages" value missing from the mapping is
        inserted into the prompt unchanged.
    """
    # NOTE(review): every code other than 1 is rendered as "Female"; confirm
    # the Gender column only has two categories.
    gender = "Male" if row["Gender"] == 1 else "Female"

    if usage_labels is None:
        # The column is produced by LabelEncoder, which numbers classes in
        # sorted order of the raw labels, so digit-leading buckets come before
        # "<" and ">":  0 -> "2-4 hours", 1 -> "4-6 hours",
        # 2 -> "< 2 hours", 3 -> "> 6 hours".
        # NOTE(review): assumes the CSV uses these four labels; verify against
        # the fitted encoder's classes_ or pass usage_labels explicitly.
        usage_labels = dict(enumerate(sorted(
            ["< 2 hours", "2-4 hours", "4-6 hours", "> 6 hours"]
        )))
    daily_usage = usage_labels.get(row["Daily usages"], row["Daily usages"])

    return f"""
    Predict the student's health rating based on their mobile phone usage.
    Use the following data:
- Age: {row['Age']}
- Gender: {gender}
- Daily Usage: {daily_usage}
- Mobile Activities: {row['Mobile phone activities']}
- Symptoms Observed: {row['Usage symptoms']}
- Symptom Frequency: {row['Symptom frequency']}

Here are some examples:
- Age: 21-25, Gender: Male, Daily Usage: 2-4 hours, Activities: Social Media, Symptoms: Headache, Frequency: Sometimes → Good
- Age: 16-20, Gender: Female, Daily Usage: >6 hours, Activities: Gaming, Symptoms: Sleep disturbance, Frequency: Frequently → Poor
- Age: 21-25, Gender: Male, Daily Usage: <2 hours, Activities: Educational, Symptoms: None, Frequency: Never → Excellent
- Age: 26-30, Gender: Female, Daily Usage: 4-6 hours, Activities: Social Media, Symptoms: Eye strain, Frequency: Often → Fair

Respond with only one word: Excellent, Good, Fair, or Poor. Do not include any other text.
"""

# Predict health rating for all student using Granite
def predict_health_ratings(df, model, verbose=True, fallback_weights=None):
    """Ask the LLM for a health rating for every row of ``df``.

    For each row a prompt is built with ``create_prompt`` and sent through
    ``replicate.run``. The last whitespace-separated token of the reply is
    stripped of surrounding punctuation and capitalized; if the result is not
    one of the four rating labels, a label is drawn at random instead.

    Args:
        df: Object whose ``iterrows()`` yields ``(index, row)`` pairs.
        model: Model identifier passed to ``replicate.run``.
        verbose: When true, print each raw reply and each fallback.
        fallback_weights: Sampling weights for the random fallback, in the
            order Excellent, Good, Fair, Poor. Defaults to the module-level
            ``data_weights``.

    Returns:
        A list with one rating label per row, in iteration order. Randomly
        drawn fallback labels are included, so downstream metrics reflect
        them as well.
    """
    HEALTH_CLASSES = ["Excellent", "Good", "Fair", "Poor"]
    if fallback_weights is None:
        fallback_weights = data_weights
    trim_chars = " .,;:!?\"'*`()[]"

    predictions = []
    invalid_outputs = []

    for i, row in df.iterrows():
        prompt = create_prompt(row)
        # NOTE(review): confirm that this model's input schema accepts
        # "max_length"; an unrecognized key may simply be ignored.
        output = replicate.run(model, input={"prompt": prompt, "temperature": 0.2, "max_length": 15})
        if output is not None and not isinstance(output, (str, list)):
            # Materialize streamed chunks so they can be printed and joined.
            output = list(output)
        if verbose:
            print(f"[Row {i}] Raw Output: {output}")

        if output is None:
            text = ""
        elif isinstance(output, str):
            text = output.strip()
        else:
            text = ''.join(str(chunk) for chunk in output).strip()

        # Normalize BEFORE validating so replies such as "good" or "Good."
        # are accepted instead of being replaced by a random guess.
        tokens = text.split()
        prediction = tokens[-1].strip(trim_chars).capitalize() if tokens else None

        if prediction not in HEALTH_CLASSES:
            invalid_outputs.append((i, text))
            # Fallback to weighted random choice based on class distribution
            prediction = random.choices(HEALTH_CLASSES, weights=fallback_weights, k=1)[0]
            if verbose:
                print(f"[Row {i}] Invalid output: {text}, using fallback: {prediction}")
        predictions.append(prediction)

    print(f"Invalid ouputs: {len(invalid_outputs)}")
    return predictions

# Generate prediction for all students
predictions = predict_health_ratings(df, model, verbose=True)
df["Predicted Health Rating"] = predictions
df["Health rating"] = df["Health rating"].str.strip().str.capitalize()

# Evaluate model's performance. Accuracy, report and confusion matrix all use
# the same rows (those with a prediction) and the same label order.
df_valid = df[df["Predicted Health Rating"].notnull()]
y_true = df_valid["Health rating"]
y_pred = df_valid["Predicted Health Rating"]
rating_labels = ["Excellent", "Good", "Fair", "Poor"]

accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.2f}")
# zero_division=0 reports 0 when a metric's denominator is zero for a class,
# instead of emitting a warning.
print("Classification Report:\n", classification_report(y_true, y_pred, labels=rating_labels, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred, labels=rating_labels))

[Row 0] Raw Output: ['', 'Good']
[Row 1] Raw Output: ['', 'Fair']
[Row 2] Raw Output: ['', 'Good']
[Row 3] Raw Output: ['', 'Excellent']
[Row 4] Raw Output: ['', 'Poor']
[Row 5] Raw Output: ['', 'Poor']
[Row 6] Raw Output: ['', 'Fair']
[Row 7] Raw Output: ['', 'Excellent']
[Row 8] Raw Output: ['', 'Fair']
[Row 9] Raw Output: ['', 'Good']
[Row 10] Raw Output: ['', 'Fair']
[Row 11] Raw Output: ['', 'Poor']
[Row 12] Raw Output: ['', 'Excellent']
[Row 13] Raw Output: ['', 'Poor']
[Row 14] Raw Output: ['', 'Good']
[Row 15] Raw Output: ['', 'Excellent']
[Row 16] Raw Output: ['', 'Poor']
[Row 17] Raw Output: ['', 'Fair']
[Row 18] Raw Output: ['', 'Poor']
[Row 19] Raw Output: ['', 'Poor']
[Row 20] Raw Output: ['', 'Fair']
[Row 21] Raw Output: ['', 'Poor']
[Row 22] Raw Output: ['', 'Good']
[Row 23] Raw Output: ['', 'Fair']
[Row 24] Raw Output: ['']
[Row 24] Invalid output: , using fallback: Good
[Row 25] Raw Output: ['']
[Row 25] Invalid output: , using fallback: Good
[Row 26] Raw Output: ['F',

In [46]:
# Prompt asking Granite to write up the classification results; the measured
# accuracy is interpolated with two decimals.
# NOTE(review): the prompt states a "strong negative correlation" as a finding,
# but no cell shown here computes a correlation - confirm it is backed by an
# analysis before using the generated text in a report.
summary_prompt = f"""
You are an AI research assistant tasked with communicating key insights from a data science project.
The project involved analyzing a dataset on student health and academic behavior. Using a large language model, we classified student health ratings with an overall accuracy of {accuracy:.2f}.
One of the most notable findings was a strong negative correlation between excessive daily phone usage and lower health ratings among students.
Write a concise, professional and detail summary of these findings. Highlight the core insight, its potential implications for student well-being and digital habits.
The tone should be formal, informative, and suitable for inclusion in a capstone report or presentation.
"""

# Send the prompt to Granite to generate a summary
summary = replicate.run(model, input=dict(prompt=summary_prompt))

# A list reply is a sequence of text chunks; glue them into one string.
summary = ''.join(summary) if isinstance(summary, list) else summary

print("Summary:", summary.strip(), sep="\n")

Summary:
## Summary:

This data science project scrutinized the interplay between student health and digital habits, employing a large language model to classify health ratings within a comprehensive dataset. The model achieved an overall accuracy of 29%, indicating a moderate capacity to discern health patterns from the provided data.

Among the significant findings, a robust negative correlation was identified between excessive daily phone usage and diminished health ratings. This correlation underscores the potential detrimental impact of prolonged and intensive smartphone use on student well-being. Students exhibiting higher levels of daily phone engagement were more likely to receive lower health ratings, suggesting a correlation with physical, mental, or behavioral health issues.

### Core Insight:

The study reveals a significant negative relationship between excessive smartphone usage and student health. This insight is pivotal as it quantifies a link often discussed anecdotall