In [1]:
import pandas as pd

# Load the data
df = pd.read_csv('Hackathon round 3 with demos[48].csv')

# Quick exploration
print(f"Total responses: {len(df)}")
print(f"Columns: {len(df.columns)}")

# Find open-ended responses
oe_columns = [col for col in df.columns if '_OE' in col]
print(f"Open-ended questions: {len(oe_columns)}")

# Look at sentiment distribution
sentiment_cols = [col for col in df.columns if 'sentiment' in col and 'percentage' in col]
for col in sentiment_cols[:3]:
    print(f"\n{col}:")
    print(df[col].describe())

Total responses: 1001
Columns: 124
Open-ended questions: 48

Q2_Experience_with_AI_animal_other_OE_sentiment_percentage:
count    112.000000
mean       0.837954
std        0.187040
min        0.339674
25%        0.692079
50%        0.937590
75%        0.996153
max        0.999776
Name: Q2_Experience_with_AI_animal_other_OE_sentiment_percentage, dtype: float64

Q4A_Sector_AI_making_a_positive_impact_other_OE_sentiment_percentage:
count    81.000000
mean      0.879611
std       0.164659
min       0.458046
25%       0.769937
50%       0.984473
75%       0.995199
max       0.998966
Name: Q4A_Sector_AI_making_a_positive_impact_other_OE_sentiment_percentage, dtype: float64

Q4B_Sector_AI_making_a_negative_impact_other_OE_sentiment_percentage:
count    81.000000
mean      0.898728
std       0.132974
min       0.465820
25%       0.825016
50%       0.964309
75%       0.994571
max       0.999315
Name: Q4B_Sector_AI_making_a_negative_impact_other_OE_sentiment_percentage, dtype: float64


In [2]:
#!/usr/bin/env python3
"""
BC AI Survey Data Explorer - Hackathon Starter Script
=====================================================

A friendly introduction to the BC AI Survey dataset.
Perfect for getting started with data exploration!

Usage:
    python explore-data-starter.py

Requirements:
    pip install pandas

"""

import pandas as pd
import sys
from pathlib import Path

def explore_bc_ai_survey():
    """Explore the BC AI Survey data with helpful guidance"""
    
    # Find the data file
    data_file = "Hackathon round 3 with demos[48].csv"
    if not Path(data_file).exists():
        print(f"■ Data file not found: {data_file}")
        print("Make sure you're running this script from the repository root directory.")
        return
    
    print("■ BC AI Survey Data Explorer")
    print("=" * 50)
    
    try:
        # Load the data
        print("■ Loading survey data...")
        df = pd.read_csv(data_file)
        print(f"■ Loaded {len(df)} responses with {len(df.columns)} columns")
        print()
        
        # Basic overview
        print("■ Dataset Overview:")
        print(f"   • Total responses: {len(df):,}")
        print(f"   • Total columns: {len(df.columns):,}")
        print(f"   • Response rate: ~{len(df)} out of 1,001 target")
        print()
        
        # Find key column types
        oe_columns = [col for col in df.columns if '_OE' in col and 'sentiment' not in col]
        sentiment_columns = [col for col in df.columns if 'sentiment_percentage' in col]
        demo_columns = ['AgeRollup_Broad', 'Q1_Location_in_BC', 'Q1_Experience_with_AI']
        
        print("■ Column Categories:")
        print(f"   • Open-ended responses (_OE): {len(oe_columns)}")
        print(f"   • Sentiment scores: {len(sentiment_columns)}")
        print(f"   • Demographic fields: {len(demo_columns)}")
        print()
        
        # Demographics snapshot
        print("■ Demographics Snapshot:")
        if 'AgeRollup_Broad' in df.columns:
            age_dist = df['AgeRollup_Broad'].value_counts()
            for age, count in age_dist.items():
                pct = (count / len(df)) * 100
                print(f"   • {age}: {count} ({pct:.1f}%)")
        print()
        
        if 'Q1_Location_in_BC' in df.columns:
            print("■ Geographic Distribution:")
            location_dist = df['Q1_Location_in_BC'].value_counts().head(5)
            for location, count in location_dist.items():
                pct = (count / len(df)) * 100
                print(f"   • {location}: {count} ({pct:.1f}%)")
        print()
        
        # AI Experience levels
        if 'Q1_Experience_with_AI' in df.columns:
            print("■ AI Experience Levels:")
            exp_dist = df['Q1_Experience_with_AI'].value_counts()
            for exp, count in exp_dist.items():
                pct = (count / len(df)) * 100
                print(f"   • {exp}: {count} ({pct:.1f}%)")
        print()
        
        # Sample some interesting quotes
        print("■ Sample Quotes (Open-Ended Responses):")
        print("   (These are the storytelling goldmines!)")
        print()
        
        # Q17 Advice to leaders
        if 'Q17_Advice_BC_Leaders_text_OE' in df.columns:
            q17_responses = df['Q17_Advice_BC_Leaders_text_OE'].dropna()
            print("   ■ Advice to BC Leaders (Q17):")
            for i, response in enumerate(q17_responses.head(3)):
                if len(response.strip()) > 10:
                    preview = response[:80] + "..." if len(response) > 80 else response
                    print(f"      {i+1}. \"{preview}\"")
            print(f"      ... and {len(q17_responses)-3:,} more responses!")
            print()
        
        # Sentiment patterns
        print("■ Sentiment Patterns:")
        sample_sentiment_col = [col for col in sentiment_columns if 'Q17' in col]
        if sample_sentiment_col:
            col = sample_sentiment_col[0]
            sentiment_scores = pd.to_numeric(df[col], errors='coerce').dropna()
            avg_sentiment = sentiment_scores.mean()
            print(f"   • Average sentiment (Q17): {avg_sentiment:.2f} (0=negative, 1=positive)")
            
            # Find extreme sentiments
            very_negative = sentiment_scores[sentiment_scores < 0.1]
            very_positive = sentiment_scores[sentiment_scores > 0.9]
            print(f"   • Very negative responses (<0.1): {len(very_negative)}")
            print(f"   • Very positive responses (>0.9): {len(very_positive)}")
        print()
        
        # Next steps guidance
        print("■ Ready to Dive Deeper?")
        print()
        print("   ■ Hot Tips for Exploration:")
        print("   1. Focus on open-ended columns (*_OE) for authentic quotes")
        print("   2. Cross-reference sentiment with demographics")
        print("   3. Look for patterns in AI experience vs attitudes")
        print("   4. Hunt for geographic differences (Vancouver vs rural)")
        print("   5. Find the extreme voices (99%+ positive/negative sentiment)")
        print()
        
        print("   ■ Suggested Next Steps:")
        print("   • Load data: df = pd.read_csv('Hackathon round 3 with demos[48].csv')")
        print("   • Explore quotes: df['Q17_Advice_BC_Leaders_text_OE'].dropna()")
        print("   • Check sentiment: df['Q17_Advice_BC_Leaders_text_OE_sentiment_percentage']")
        print("   • Filter by demographics: df[df['AgeRollup_Broad'] == '18-34']")
        print("   • Find patterns: df.groupby('Q1_Experience_with_AI')['sentiment_col'].mean()")
        print()
        
        print("■ Happy Data Storytelling!")
        print("   Remember: Every row is a real British Columbian's voice.")
        print("   Your job is to help those voices be heard!")
        
    except Exception as e:
        print(f"■ Error loading data: {e}")
        print("Make sure pandas is installed: pip install pandas")

if __name__ == "__main__":
    explore_bc_ai_survey()

■ BC AI Survey Data Explorer
■ Loading survey data...
■ Loaded 1001 responses with 124 columns

■ Dataset Overview:
   • Total responses: 1,001
   • Total columns: 124
   • Response rate: ~1001 out of 1,001 target

■ Column Categories:
   • Open-ended responses (_OE): 16
   • Sentiment scores: 16
   • Demographic fields: 3

■ Demographics Snapshot:
   • 55 Plus: 423 (42.3%)
   • 35-54: 348 (34.8%)
   • 18-34: 230 (23.0%)

■ Geographic Distribution:
   • Vancouver / Lower Mainland: 767 (76.6%)
   • Victoria: 137 (13.7%)
   • Kelowna: 67 (6.7%)
   • Prince George: 30 (3.0%)

■ AI Experience Levels:
   • Occasional user – I’ve tried a few tools like ChatGPT or an AI art app.: 405 (40.5%)
   • Aware but not a user – I’ve heard about AI but haven’t used it myself.: 316 (31.6%)
   • Regular user – I use AI often for work or personal stuff.: 164 (16.4%)
   • No experience – I haven’t really used AI and don’t know much about it.: 106 (10.6%)
   • Expert/Developer – I work with or build AI tool

In [3]:
df = pd.read_csv('Hackathon round 3 with demos[48].csv')

In [4]:
df['Q17_Advice_BC_Leaders_text_OE'].dropna()

0                                  People need to survive
1       Expand the thinking and possibilities of artif...
2       Focus on ethical use and promotion of AI. It's...
3                            Don't let it take jobs away 
4       My ideal would be to disband the NDP, but as t...
                              ...                        
996                       encourage use in Health matters
997                                    We don’t need AI. 
998                         Carefully monitor and control
999     Since I do not know anything about it , I’m in...
1000    Keep bc working. Rejig elections canada. There...
Name: Q17_Advice_BC_Leaders_text_OE, Length: 993, dtype: object

In [5]:
import pandas as pd
from collections import Counter

def count_words_fast(text):
    text = text.lower()
    skips = [".", ",", ":", ";", "'", '"']
    for ch in skips:
        text = text.replace(ch, "")
    word_counts = Counter(text.split())
    return word_counts

# Combine all responses into one large string
all_text = " ".join(df['Q17_Advice_BC_Leaders_text_OE'].dropna().tolist())

# Count word frequency
word_freq = count_words_fast(all_text)

# Convert to a sorted DataFrame for display
word_freq_df = pd.DataFrame(word_freq.items(), columns=['Word', 'Frequency'])
word_freq_df = word_freq_df.sort_values(by='Frequency', ascending=False).reset_index(drop=True)

# Display top 20 most common words
print(word_freq_df.head(50))

           Word  Frequency
0           and        503
1            to        446
2           the        439
3            it        394
4            ai        286
5            be        244
6            of        241
7           not        168
8           for        159
9            in        158
10           is        148
11            a        147
12         make        125
13           on        115
14         that        113
15          use        113
16       people        110
17         dont        104
18         with         97
19           do         79
20         sure         76
21         have         72
22          are         71
23          all         68
24        don’t         67
25           as         67
26           we         65
27          its         65
28            i         62
29         this         59
30       should         57
31           or         57
32         will         55
33          let         55
34         from         50
35          you         48
3

In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
filtered_df = word_freq_df[~word_freq_df['Word'].isin(stop_words)]

# Xem top từ mang ý nghĩa
print(filtered_df.head(20))

           Word  Frequency
4            ai        286
12         make        125
15          use        113
16       people        110
17         dont        104
20         sure         76
24        don’t         67
33          let         55
39         keep         44
41         need         44
42         take         43
45          get         41
46  regulations         41
47        think         40
49     everyone         39
50     regulate         39
51         jobs         38
55       public         36
59         good         35
61         know         32


In [8]:
from collections import Counter
import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text.strip()

print("■ Word Frequency by Age Group (Q17 Responses)")
print("=" * 50)

if 'Q17_Advice_BC_Leaders_text_OE' in df.columns and 'AgeRollup_Broad' in df.columns:
    age_groups = df['AgeRollup_Broad'].dropna().unique()
    
    for age_group in age_groups:
        print(f"\n Age Group: {age_group}")
        subset = df[(df['AgeRollup_Broad'] == age_group) & (df['Q17_Advice_BC_Leaders_text_OE'].notna())]
        texts = subset['Q17_Advice_BC_Leaders_text_OE'].apply(clean_text)
        all_text = " ".join(texts)
        word_freq = Counter(all_text.split())
        
        # Loại bỏ từ dừng
        filtered = {w: c for w, c in word_freq.items() if w not in stop_words}
        top_words = sorted(filtered.items(), key=lambda x: x[1], reverse=True)[:10]
        
        for word, count in top_words:
            print(f"   • {word}: {count}")


■ Word Frequency by Age Group (Q17 Responses)

 Age Group: 18-34
   • ai: 81
   • dont: 47
   • make: 35
   • people: 32
   • use: 32
   • sure: 14
   • let: 13
   • regulate: 13
   • good: 13
   • stop: 12

 Age Group: 55 Plus
   • ai: 103
   • dont: 55
   • make: 42
   • people: 39
   • use: 37
   • sure: 30
   • think: 20
   • get: 20
   • keep: 20
   • careful: 19

 Age Group: 35-54
   • ai: 102
   • dont: 69
   • make: 48
   • use: 46
   • people: 39
   • sure: 33
   • let: 25
   • jobs: 21
   • regulations: 21
   • take: 20


In [9]:
print("■ Word Frequency by Region (Q17 Responses)")
print("=" * 50)

from collections import Counter
import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text.strip()

# Lấy top 5 vùng xuất hiện nhiều nhất
top_regions = df['Q1_Location_in_BC'].value_counts().head(5).index.tolist()

for region in top_regions:
    print(f"\n Region: {region}")
    subset = df[(df['Q1_Location_in_BC'] == region) & (df['Q17_Advice_BC_Leaders_text_OE'].notna())]
    texts = subset['Q17_Advice_BC_Leaders_text_OE'].apply(clean_text)
    all_text = " ".join(texts)
    word_freq = Counter(all_text.split())

    # Lọc stopwords
    filtered = {w: c for w, c in word_freq.items() if w not in stop_words}
    top_words = sorted(filtered.items(), key=lambda x: x[1], reverse=True)[:10]

    for word, count in top_words:
        print(f"   • {word}: {count}")


■ Word Frequency by Region (Q17 Responses)

 Region: Vancouver / Lower Mainland
   • ai: 213
   • dont: 136
   • make: 95
   • people: 89
   • use: 85
   • sure: 55
   • let: 48
   • keep: 35
   • take: 33
   • think: 33

 Region: Victoria
   • ai: 51
   • dont: 27
   • make: 18
   • use: 17
   • sure: 14
   • people: 13
   • regulate: 9
   • get: 7
   • everyone: 7
   • accessible: 6

 Region: Kelowna
   • make: 10
   • ai: 9
   • sure: 8
   • use: 7
   • need: 6
   • dont: 5
   • monitor: 4
   • public: 4
   • ethical: 4
   • everyone: 4

 Region: Prince George
   • ai: 13
   • use: 6
   • people: 6
   • get: 5
   • find: 5
   • human: 4
   • everyone: 4
   • good: 3
   • public: 3
   • dont: 3


In [10]:
print(df['Q17_Advice_BC_Leaders_text_OE_sentiment_percentage'].head())

print(df['Q17_Advice_BC_Leaders_text_OE_sentiment_percentage'].describe())

0    0.553198
1    0.570897
2    0.888593
3    0.524501
4    0.456881
Name: Q17_Advice_BC_Leaders_text_OE_sentiment_percentage, dtype: float64
count    997.000000
mean       0.713045
std        0.182151
min        0.288424
25%        0.560119
50%        0.705587
75%        0.877080
max        0.999290
Name: Q17_Advice_BC_Leaders_text_OE_sentiment_percentage, dtype: float64


In [11]:
age_group = '18-34'
subset = df[df['AgeRollup_Broad'] == age_group]
sentiments = pd.to_numeric(subset['Q17_Advice_BC_Leaders_text_OE_sentiment_percentage'], errors='coerce').dropna()

print(f" Average sentiment for age {age_group}: {sentiments.mean():.3f}")

 Average sentiment for age 18-34: 0.713


In [12]:
df['Q17_Advice_BC_Leaders_text_OE_sentiment_percentage'] = pd.to_numeric(
    df['Q17_Advice_BC_Leaders_text_OE_sentiment_percentage'], errors='coerce'
)

sentiment_by_ai_exp = df.groupby('Q1_Experience_with_AI')['Q17_Advice_BC_Leaders_text_OE_sentiment_percentage'].mean().sort_values(ascending=False)

print(" Average sentiment by AI experience level:")
print(sentiment_by_ai_exp)

 Average sentiment by AI experience level:
Q1_Experience_with_AI
Regular user – I use AI often for work or personal stuff.                  0.735705
Aware but not a user – I’ve heard about AI but haven’t used it myself.     0.717028
No experience – I haven’t really used AI and don’t know much about it.     0.711189
Occasional user – I’ve tried a few tools like ChatGPT or an AI art app.    0.703087
Expert/Developer – I work with or build AI tools.                          0.643874
Name: Q17_Advice_BC_Leaders_text_OE_sentiment_percentage, dtype: float64


In [13]:
very_positive = df[df['Q17_Advice_BC_Leaders_text_OE_sentiment_percentage'] > 0.9]
very_negative = df[df['Q17_Advice_BC_Leaders_text_OE_sentiment_percentage'] < 0.1]

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

text_data = df['Q17_Advice_BC_Leaders_text_OE'].dropna().astype(str)

tfidf = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=3, ngram_range=(1,2))
tfidf_matrix = tfidf.fit_transform(text_data)

num_clusters = 5 
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init='auto')
kmeans.fit(tfidf_matrix)


df_clustered = text_data.to_frame(name='Response')
df_clustered['Cluster'] = kmeans.labels_


for i in range(num_clusters):
    print(f"\n Cluster {i}")
    sample = df_clustered[df_clustered['Cluster'] == i]['Response'].sample(3, random_state=42)
    for resp in sample:
        print(f"   - {resp[:100]}...")


 Cluster 0
   - I dont think AI is a good thing when it takes away peoples ability to think for themselves ...
   - AI should not be the driver of decisions or source of information. Critical thinking needs to be pro...
   - Force all use of ai to be public and transparent ...

 Cluster 1
   - Don't use it to line their pockets and their friends with money...
   - Let it be a tool to be used. Don’t let it take the place of humans...
   - don't hope AI can do all the job...

 Cluster 2
   - Not sure...
   - Be vigilant and do your own research....
   - No thoughts ...

 Cluster 3
   - Please don't let this add to our province's energy consumption. If we are going to have data centers...
   - Make it accessible and barrier-free for everyone...
   - Make it free...

 Cluster 4
   - Take it slow. ...
   - Go very, very slow....
   - Take it slow ...
