# This is the note and code for thesis project
# Evaluating the Impact of AI tools on Academic Progress among University Students

In [1]:
## 1. Introduction 

In [2]:
## 2. Literature Review

In [3]:
# Importing necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
plt.style.use('ggplot')

The dataset shall be cleaned through the following processes: 
1. Checking the # of columns
2. Changing mispelt column names to the correct names
3. Checking for missing values
4. Checking for duplicate values
5. Checking for outliers

In [4]:
#Load the data
df = pd.read_excel('./raw_data.xlsx')


# Step 1: Data Cleaning and Preprocessing


In [5]:
# Create a dictionary to map old names to new names
column_name_mapping = {
    'Dấu thời gian': 'timestamp',
    "What's your major ? ": 'major',
    'How frequently do you use AI tools for academic purposes?': 'ai_use_freq',
    'Which AI tools do you use most frequently? ': 'top_ai_tools',
    'What are the primary purposes for which you use AI tools ? ': 'ai_use_purpose',
    'How satisfied are you with the performance of AI tools in general ?': 'ai_satisfaction',
    'Have you encountered any significant challenges while using AI tools ?': 'ai_challenges',
    'If yes, please specify the challenges that you faced ': 'challenges_detail',
    'To what extent have AI tools helped you improve your academic performance?': 'ai_academic_help',
    'Do you feel that AI tools have enhanced your understanding of complex topics?': 'ai_enhance_understanding',
    "Do you find AI tools' outputs to be accurate and reliable ?": 'ai_reliability',
    'How do you integrate AI tools into your study routine? ': 'ai_integration',
    'How confident are you in using AI tools for important academic tasks?': 'ai_confidence',
    'Do you think ChatGPT provides enough context in its responses ? ': 'chatgpt_context',
    'In your opinion, how well does ChatGPT understand and respond to complex topics ? ': 'chatgpt_complex_topics',
    'Would you recommend the use of AI tools to your peers ?': 'ai_recommend',
    'Have you ever used AI tools in a way that might be considered academically dishonest? ': 'ai_dishonest_use',
    'How do you ensure that your use of AI tools remains ethical and honest? ': 'ai_ethical_use',
    'Do you think AI tools encourage or discourage academic honesty?': 'ai_encourage_honesty',
    'Have you received guidance from your institution on the ethical use of AI tools?': 'institution_guidance',
    'In your opinion, what measures can be taken to promote ethical use of AI tools among students ? ': 'promote_ethical_use',
    'Have you ever used ChatGPT to generate content that you submitted as your own work ? ': 'chatgpt_submitted_work',
    'Do you think using ChatGPT for assignments constitutes plagiarism ?': 'chatgpt_plagiarism',
    'Have you experienced any consequences (positive or negative) from using ChatGPT in your academic work ? ': 'chatgpt_consequences',
    'If yes, please describe the consequences you experienced.': 'consequences_detail'
}
# Rename the columns
df.rename(columns=column_name_mapping, inplace=True)

# Verify the new column names
print(df.columns)

Index(['timestamp', 'major', 'ai_use_freq', 'top_ai_tools', 'ai_use_purpose',
       'ai_satisfaction', 'ai_challenges', 'challenges_detail',
       'ai_academic_help', 'ai_enhance_understanding', 'ai_reliability',
       'ai_integration', 'ai_confidence', 'chatgpt_context',
       'chatgpt_complex_topics', 'ai_recommend', 'ai_dishonest_use',
       'ai_ethical_use', 'ai_encourage_honesty', 'institution_guidance',
       'promote_ethical_use', 'chatgpt_submitted_work', 'chatgpt_plagiarism',
       'chatgpt_consequences', 'consequences_detail'],
      dtype='object')


In [6]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
df.head()

Unnamed: 0,timestamp,major,ai_use_freq,top_ai_tools,ai_use_purpose,ai_satisfaction,ai_challenges,challenges_detail,ai_academic_help,ai_enhance_understanding,ai_reliability,ai_integration,ai_confidence,chatgpt_context,chatgpt_complex_topics,ai_recommend,ai_dishonest_use,ai_ethical_use,ai_encourage_honesty,institution_guidance,promote_ethical_use,chatgpt_submitted_work,chatgpt_plagiarism,chatgpt_consequences,consequences_detail
0,2024-06-12 16:03:52.297,CSE,A few times a month,"ChatGPT, Bing Ai","Research assistance, Writing and editing, Lear...",Neutral,,,Somewhat improved,Agree,Quite reliable,Occasionally for quick assistance,4,Yes,3,Probably,No,Carefully reading what the AI say.,Neutral,No,,Maybe,It depends on how I use it.,No,
1,2024-06-12 16:05:21.965,BFA,A few times a month,"ChatGPT, Google Assistant",Research assistance,Satisfied,No,,Somewhat improved,Strongly Agree,Quite reliable,Occasionally for quick assistance,1,No,4,Probably,No,The lecturer can easily figures out if I use A...,Discourage,No,,Maybe,Yes,No,
2,2024-06-12 16:08:19.520,CSE,A few times a month,"ChatGPT, Quillbot","Research assistance, Writing and editing, Lear...",Satisfied,Yes,Everything need to be proof-readed,Somewhat improved,Strongly Disagree,Neutral,As a supplementary tool,4,No,1,Probably,No,Anything academic-related can often be found f...,Neutral,No,"For now, only fact and plagiarism checking",No,Depends on different response,No,
3,2024-06-12 16:12:14.940,CSE,Daily,"ChatGPT, Google Assistant, Microsoft Azure AI","Writing and editing, Coding and debugging",Neutral,No,,Somewhat improved,Strongly Agree,Neutral,As a supplementary tool,4,Yes,2,Definitely,No,Consider AI tool as recommended path to improv...,Neutral,No,AI workshop for better understanding,Yes,No,Yes,Faster works done trade back for worse academi...
4,2024-06-12 16:52:40.595,BBA,Several times a week,ChatGPT,Research assistance,Neutral,Yes,"misinformation, repetitive results for differe...",No impact,Neutral,Rarely,Occasionally for quick assistance,3,No,3,Probably,No,"Well, don't just copy paste a passage without ...",Neutral,No,,No,Doesn't have enough information to answer this...,No,


In [7]:
df2 = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, 2, 3, np.nan, 5]
})

print("Original DataFrame:")
print(df2)

# Fill NaN values in column 'A' with its median
df2['A'].fillna(df2['A'].median(), inplace=True)

# Fill NaN values in column 'B' with its median
df2['B'].fillna(df2['B'].median(), inplace=True)

print("\nDataFrame after filling NaN values:")
print(df2)
print(df2['A'].median())
print(df2['B'].median())


Original DataFrame:
     A    B
0  1.0  NaN
1  2.0  2.0
2  NaN  3.0
3  4.0  NaN
4  5.0  5.0

DataFrame after filling NaN values:
     A    B
0  1.0  3.0
1  2.0  2.0
2  3.0  3.0
3  4.0  3.0
4  5.0  5.0
3.0
3.0


In [8]:
# Clean and standardize categorical variables
df['major'] = df['major'].str.strip().str.upper()
df['ai_use_freq'] = df['ai_use_freq'].str.strip().str.lower()
df['ai_satisfaction'] = df['ai_satisfaction'].str.strip().str.lower()
df['ai_academic_help'] = df['ai_academic_help'].str.strip().str.lower()
df['ai_enhance_understanding'] = df['ai_enhance_understanding'].str.strip().str.lower()
df['ai_reliability'] = df['ai_reliability'].str.strip().str.lower()
df['ai_integration'] = df['ai_integration'].str.strip().str.lower()
df['ai_recommend'] = df['ai_recommend'].str.strip().str.lower()
df['ai_encourage_honesty'] = df['ai_encourage_honesty'].str.strip().str.lower()

In [9]:
# Convert boolean columns to actual boolean type
boolean_columns = ['ai_challenges', 'chatgpt_context', 'ai_dishonest_use', 'institution_guidance', 'chatgpt_submitted_work', 'chatgpt_consequences']
for col in boolean_columns:
    df[col] = df[col].map({'Yes': True, 'No': False, 'Maybe': None})

In [10]:
# Convert numeric columns to appropriate type
df['ai_confidence'] = pd.to_numeric(df['ai_confidence'], errors='coerce')
df['chatgpt_complex_topics'] = pd.to_numeric(df['chatgpt_complex_topics'], errors='coerce')


In [11]:
# Handle missing values
for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna('Not specified', inplace=True)
    else:
        df[col].fillna(df[col].median(), inplace=True)

In [12]:
# Create a list of AI tools used
df['ai_tools_list'] = df['top_ai_tools'].str.split(',').apply(lambda x: [tool.strip() for tool in x] if isinstance(x, list) else [])

# Create dummy variables for AI tools
ai_tools = df['ai_tools_list'].explode().unique()
for tool in ai_tools:
    df[f'uses_{tool}'] = df['ai_tools_list'].apply(lambda x: tool in x if isinstance(x, list) else False)


In [13]:
# Create dummy variables for AI use purposes
df['ai_use_purpose_list'] = df['ai_use_purpose'].str.split(',').apply(lambda x: [purpose.strip() for purpose in x] if isinstance(x, list) else [])
ai_purposes = df['ai_use_purpose_list'].explode().unique()
for purpose in ai_purposes:
    df[f'purpose_{purpose}'] = df['ai_use_purpose_list'].apply(lambda x: purpose in x if isinstance(x, list) else False)


In [14]:

# Print info about the cleaned dataset
print(df.info())
print("\nMissing values:\n", df.isnull().sum())
print("\nUnique values in categorical columns:")
for col in df.select_dtypes(include=['object']).columns:
    print(f"{col}: {df[col].nunique()}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 56 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   timestamp                             46 non-null     datetime64[ns]
 1   major                                 46 non-null     object        
 2   ai_use_freq                           46 non-null     object        
 3   top_ai_tools                          46 non-null     object        
 4   ai_use_purpose                        46 non-null     object        
 5   ai_satisfaction                       46 non-null     object        
 6   ai_challenges                         46 non-null     object        
 7   challenges_detail                     46 non-null     object        
 8   ai_academic_help                      46 non-null     object        
 9   ai_enhance_understanding              46 non-null     object        
 10  ai_r

TypeError: unhashable type: 'list'

# Feature Engineering


In [None]:

# Text Processing
tfidf = TfidfVectorizer(max_features = 10)
ai_putpose_tfidf = tfidf.fit_transform()