In [1]:
# Cannabis Strains Exploratory Data Analysis
# Author: nesarks
# Date: 2025-05-16

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import scipy.stats as stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import networkx as nx
from collections import Counter
import re
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

# Set visualization style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set(font_scale=1.2)
colors = sns.color_palette('viridis', 3)

# Load the data
df = pd.read_csv('cannabis.csv')

In [2]:
# Function to create and save plots with consistent style
def save_plot(filename, dpi=300, bbox_inches='tight'):
    plt.savefig(f"plots/{filename}", dpi=dpi, bbox_inches=bbox_inches)
    plt.close()

import os
if not os.path.exists('plots'):
    os.makedirs('plots')

In [5]:
# 1. DATASET OVERVIEW
print("=" * 80)
print("1. DATASET OVERVIEW")
print("=" * 80)

# Basic summary statistics
print(f"Dataset shape: {df.shape}")
print(f"Number of unique strains: {df['Strain'].nunique()}")

# Check duplicates
duplicated = df[df.duplicated(subset=['Strain'], keep=False)]
print(f"Number of strains with duplicate entries: {len(duplicated['Strain'].unique())}")
if not duplicated.empty:
    print("Top duplicated strains:")
    print(duplicated['Strain'].value_counts())

# Data types and missing values
print("\nData types:")
print(df.dtypes)

print("\nMissing values:")
print(df.isnull().sum())

# Summary statistics
print("\nSummary statistics:")
print(df['Rating'].describe())

# Distribution of strain types
strain_types = df['Type'].value_counts()
print("\nDistribution of strain types:")
print(strain_types)

# Visualizing strain type distribution
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='Type', data=df, palette=colors)
plt.title('Distribution of Cannabis Strain Types', fontsize=16)
plt.xlabel('Strain Type', fontsize=14)
plt.ylabel('Count', fontsize=14)

# Add count labels on top of bars
for i, count in enumerate(strain_types):
    ax.text(i, count + 10, str(count), ha='center', fontsize=12)

save_plot('strain_type_distribution.png')


1. DATASET OVERVIEW
Dataset shape: (2351, 6)
Number of unique strains: 2350
Number of strains with duplicate entries: 1
Top duplicated strains:
B-Witched    2
Name: Strain, dtype: int64

Data types:
Strain          object
Type            object
Rating         float64
Effects         object
Flavor          object
Description     object
dtype: object

Missing values:
Strain          0
Type            0
Rating          0
Effects         0
Flavor         46
Description    33
dtype: int64

Summary statistics:
count    2351.000000
mean        4.310634
std         0.836394
min         0.000000
25%         4.200000
50%         4.400000
75%         4.700000
max         5.000000
Name: Rating, dtype: float64

Distribution of strain types:
hybrid    1212
indica     699
sativa     440
Name: Type, dtype: int64


In [6]:
# 2. DISTRIBUTION ANALYSIS
print("\n" + "=" * 80)
print("2. DISTRIBUTION ANALYSIS")
print("=" * 80)

# Rating distribution
plt.figure(figsize=(12, 6))
sns.histplot(data=df, x='Rating', kde=True, bins=20)
plt.title('Distribution of Cannabis Strain Ratings', fontsize=16)
plt.xlabel('Rating', fontsize=14)
plt.ylabel('Count', fontsize=14)
save_plot('rating_distribution.png')

# Rating distribution by strain type
plt.figure(figsize=(14, 8))
sns.boxplot(x='Type', y='Rating', data=df, palette=colors)
plt.title('Rating Distribution by Strain Type', fontsize=16)
plt.xlabel('Strain Type', fontsize=14)
plt.ylabel('Rating', fontsize=14)
save_plot('rating_by_type_boxplot.png')

plt.figure(figsize=(14, 8))
sns.violinplot(x='Type', y='Rating', data=df, palette=colors, inner='quartile')
plt.title('Rating Violin Plot by Strain Type', fontsize=16)
plt.xlabel('Strain Type', fontsize=14)
plt.ylabel('Rating', fontsize=14)
save_plot('rating_by_type_violin.png')

# Function to extract and count effects and flavors
def extract_features(series):
    all_features = []
    for item in series.dropna():
        features = item.split(',')
        all_features.extend([feature.strip() for feature in features])
    return Counter(all_features)

# Extract effects and flavors
effects_counter = extract_features(df['Effects'])
flavors_counter = extract_features(df['Flavor'])

# Plot top effects
plt.figure(figsize=(14, 8))
top_effects = pd.DataFrame(effects_counter.most_common(10), columns=['Effect', 'Count'])
sns.barplot(x='Count', y='Effect', data=top_effects, palette='viridis')
plt.title('Top 10 Most Common Effects', fontsize=16)
plt.xlabel('Count', fontsize=14)
plt.ylabel('Effect', fontsize=14)
save_plot('top_effects.png')

# Plot top flavors
plt.figure(figsize=(14, 8))
top_flavors = pd.DataFrame(flavors_counter.most_common(10), columns=['Flavor', 'Count'])
sns.barplot(x='Count', y='Flavor', data=top_flavors, palette='viridis')
plt.title('Top 10 Most Common Flavors', fontsize=16)
plt.xlabel('Count', fontsize=14)
plt.ylabel('Flavor', fontsize=14)
save_plot('top_flavors.png')

# Create WordClouds for effects and flavors
plt.figure(figsize=(12, 10))
effects_text = ' '.join([effect for effect, count in effects_counter.items() for _ in range(count)])
wordcloud_effects = WordCloud(width=800, height=400, background_color='white', colormap='viridis', 
                             max_words=100, contour_width=3, contour_color='steelblue').generate(effects_text)
plt.imshow(wordcloud_effects, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Cannabis Effects', fontsize=16)
save_plot('effects_wordcloud.png')

plt.figure(figsize=(12, 10))
flavors_text = ' '.join([flavor for flavor, count in flavors_counter.items() for _ in range(count)])
wordcloud_flavors = WordCloud(width=800, height=400, background_color='white', colormap='viridis',
                             max_words=100, contour_width=3, contour_color='steelblue').generate(flavors_text)
plt.imshow(wordcloud_flavors, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Cannabis Flavors', fontsize=16)
save_plot('flavors_wordcloud.png')

# Count effects and flavors per strain
df['Effects_Count'] = df['Effects'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0)
df['Flavor_Count'] = df['Flavor'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0)

plt.figure(figsize=(12, 6))
sns.histplot(data=df, x='Effects_Count', kde=True, bins=10, color=colors[0])
plt.title('Distribution of Effects Count per Strain', fontsize=16)
plt.xlabel('Number of Effects', fontsize=14)
plt.ylabel('Count', fontsize=14)
save_plot('effects_count_distribution.png')

plt.figure(figsize=(12, 6))
sns.histplot(data=df, x='Flavor_Count', kde=True, bins=10, color=colors[1])
plt.title('Distribution of Flavors Count per Strain', fontsize=16)
plt.xlabel('Number of Flavors', fontsize=14)
plt.ylabel('Count', fontsize=14)
save_plot('flavors_count_distribution.png')

# Compare effects and flavors across strain types
# Function to get top effects/flavors by strain type
def get_top_by_type(feature_col, strain_type, top_n=5):
    strains = df[df['Type'] == strain_type][feature_col].dropna()
    features = []
    for item in strains:
        features.extend([f.strip() for f in item.split(',')])
    return Counter(features).most_common(top_n)

# Top effects by strain type
strain_types_list = df['Type'].unique()
plt.figure(figsize=(18, 10))
for i, strain_type in enumerate(strain_types_list):
    top_n_effects = get_top_by_type('Effects', strain_type, top_n=5)
    effects_df = pd.DataFrame(top_n_effects, columns=['Effect', 'Count'])
    
    plt.subplot(1, 3, i+1)
    sns.barplot(x='Count', y='Effect', data=effects_df, color=colors[i])
    plt.title(f'Top 5 Effects in {strain_type.capitalize()} Strains', fontsize=14)
    plt.xlabel('Count', fontsize=12)
    
    if i == 0:
        plt.ylabel('Effect', fontsize=12)
    else:
        plt.ylabel('')
        
plt.tight_layout()
save_plot('top_effects_by_type.png')

# Top flavors by strain type
plt.figure(figsize=(18, 10))
for i, strain_type in enumerate(strain_types_list):
    top_n_flavors = get_top_by_type('Flavor', strain_type, top_n=5)
    flavors_df = pd.DataFrame(top_n_flavors, columns=['Flavor', 'Count'])
    
    plt.subplot(1, 3, i+1)
    sns.barplot(x='Count', y='Flavor', data=flavors_df, color=colors[i])
    plt.title(f'Top 5 Flavors in {strain_type.capitalize()} Strains', fontsize=14)
    plt.xlabel('Count', fontsize=12)
    
    if i == 0:
        plt.ylabel('Flavor', fontsize=12)
    else:
        plt.ylabel('')
        
plt.tight_layout()
save_plot('top_flavors_by_type.png')


2. DISTRIBUTION ANALYSIS


In [7]:
# 3. RELATIONSHIP ANALYSIS
print("\n" + "=" * 80)
print("3. RELATIONSHIP ANALYSIS")
print("=" * 80)

# Create effect and flavor presence indicators (one-hot encoding)
def create_feature_columns(df, column_name):
    features = set()
    for items in df[column_name].dropna():
        for item in items.split(','):
            features.add(item.strip())
    
    for feature in features:
        col_name = f"{column_name}_{feature.replace(' ', '_')}"
        df[col_name] = df[column_name].apply(
            lambda x: 1 if isinstance(x, str) and feature in x else 0
        )
    return features

effects_set = create_feature_columns(df, 'Effects')
flavors_set = create_feature_columns(df, 'Flavor')

# Correlation between ratings and effects
effect_cols = [f"Effects_{effect.replace(' ', '_')}" for effect in effects_set]
effect_corr = df[['Rating'] + effect_cols].corr()['Rating'].drop('Rating').sort_values(ascending=False)

# Correlation between ratings and flavors
flavor_cols = [f"Flavor_{flavor.replace(' ', '_')}" for flavor in flavors_set]
flavor_corr = df[['Rating'] + flavor_cols].corr()['Rating'].drop('Rating').sort_values(ascending=False)

# Plot top correlations with ratings
plt.figure(figsize=(14, 10))
sns.barplot(x=effect_corr.iloc[:10].values, y=effect_corr.iloc[:10].index.str.replace('Effects_', '').str.replace('_', ' '))
plt.title('Top 10 Effects with Highest Correlation to Rating', fontsize=16)
plt.xlabel('Correlation Coefficient', fontsize=14)
plt.ylabel('Effect', fontsize=14)
save_plot('effect_rating_correlation.png')

plt.figure(figsize=(14, 10))
sns.barplot(x=effect_corr.iloc[-10:].values, y=effect_corr.iloc[-10:].index.str.replace('Effects_', '').str.replace('_', ' '))
plt.title('Bottom 10 Effects with Lowest Correlation to Rating', fontsize=16)
plt.xlabel('Correlation Coefficient', fontsize=14)
plt.ylabel('Effect', fontsize=14)
save_plot('effect_rating_correlation_bottom.png')

plt.figure(figsize=(14, 10))
sns.barplot(x=flavor_corr.iloc[:10].values, y=flavor_corr.iloc[:10].index.str.replace('Flavor_', '').str.replace('_', ' '))
plt.title('Top 10 Flavors with Highest Correlation to Rating', fontsize=16)
plt.xlabel('Correlation Coefficient', fontsize=14)
plt.ylabel('Flavor', fontsize=14)
save_plot('flavor_rating_correlation.png')

plt.figure(figsize=(14, 10))
sns.barplot(x=flavor_corr.iloc[-10:].values, y=flavor_corr.iloc[-10:].index.str.replace('Flavor_', '').str.replace('_', ' '))
plt.title('Bottom 10 Flavors with Lowest Correlation to Rating', fontsize=16)
plt.xlabel('Correlation Coefficient', fontsize=14)
plt.ylabel('Flavor', fontsize=14)
save_plot('flavor_rating_correlation_bottom.png')

# Calculate average rating for each effect and flavor
def avg_rating_by_feature(df, feature_column, feature_prefix):
    feature_ratings = []
    cols = [col for col in df.columns if col.startswith(feature_prefix)]
    
    for col in cols:
        feature_name = col.replace(feature_prefix, '').replace('_', ' ')
        avg_rating = df[df[col] == 1]['Rating'].mean()
        feature_ratings.append((feature_name, avg_rating, df[col].sum()))
    
    return pd.DataFrame(feature_ratings, columns=['Feature', 'Avg_Rating', 'Count'])

# Average rating by effect and flavor
effect_ratings = avg_rating_by_feature(df, 'Effects', 'Effects_')
flavor_ratings = avg_rating_by_feature(df, 'Flavor', 'Flavor_')

# Plot top and bottom effects by average rating
effect_ratings_sorted = effect_ratings.sort_values('Avg_Rating', ascending=False)
plt.figure(figsize=(14, 10))
sns.scatterplot(x='Avg_Rating', y='Feature', size='Count', data=effect_ratings_sorted.head(10), sizes=(100, 500))
plt.title('Top 10 Highest Rated Effects (with Count)', fontsize=16)
plt.xlabel('Average Rating', fontsize=14)
plt.ylabel('Effect', fontsize=14)
plt.xlim(4, 4.7)  # Adjust based on your data
save_plot('top_rated_effects.png')

plt.figure(figsize=(14, 10))
sns.scatterplot(x='Avg_Rating', y='Feature', size='Count', data=effect_ratings_sorted.tail(10), sizes=(100, 500))
plt.title('Bottom 10 Lowest Rated Effects (with Count)', fontsize=16)
plt.xlabel('Average Rating', fontsize=14)
plt.ylabel('Effect', fontsize=14)
plt.xlim(4, 4.7)  # Adjust based on your data
save_plot('bottom_rated_effects.png')

# Plot top and bottom flavors by average rating
flavor_ratings_sorted = flavor_ratings.sort_values('Avg_Rating', ascending=False)
plt.figure(figsize=(14, 10))
sns.scatterplot(x='Avg_Rating', y='Feature', size='Count', data=flavor_ratings_sorted.head(10), sizes=(100, 500))
plt.title('Top 10 Highest Rated Flavors (with Count)', fontsize=16)
plt.xlabel('Average Rating', fontsize=14)
plt.ylabel('Flavor', fontsize=14)
plt.xlim(4, 4.7)  # Adjust based on your data
save_plot('top_rated_flavors.png')

plt.figure(figsize=(14, 10))
sns.scatterplot(x='Avg_Rating', y='Feature', size='Count', data=flavor_ratings_sorted.tail(10), sizes=(100, 500))
plt.title('Bottom 10 Lowest Rated Flavors (with Count)', fontsize=16)
plt.xlabel('Average Rating', fontsize=14)
plt.ylabel('Flavor', fontsize=14)
plt.xlim(4, 4.7)  # Adjust based on your data
save_plot('bottom_rated_flavors.png')

# Create network graph for effects co-occurrence
def create_network_graph(df, feature_column, min_cooccurrence=50):
    G = nx.Graph()
    
    # Extract all unique features
    all_features = []
    for items in df[feature_column].dropna():
        features = [item.strip() for item in items.split(',')]
        all_features.append(features)
    
    # Count co-occurrences
    cooccurrence = {}
    for features in all_features:
        for i in range(len(features)):
            for j in range(i + 1, len(features)):
                pair = tuple(sorted([features[i], features[j]]))
                if pair in cooccurrence:
                    cooccurrence[pair] += 1
                else:
                    cooccurrence[pair] = 1
    
    # Add edges with sufficient co-occurrence
    for (feat1, feat2), count in cooccurrence.items():
        if count >= min_cooccurrence:
            G.add_edge(feat1, feat2, weight=count)
    
    return G, cooccurrence

# Create effect co-occurrence network
effect_network, effect_cooccurrence = create_network_graph(df, 'Effects', min_cooccurrence=100)

plt.figure(figsize=(16, 14))
pos = nx.spring_layout(effect_network, k=0.3, iterations=50)
edge_weights = [effect_network[u][v]['weight'] for u, v in effect_network.edges()]
max_weight = max(edge_weights)
norm_weights = [3 * w / max_weight for w in edge_weights]

nx.draw_networkx_nodes(effect_network, pos, node_color='skyblue', node_size=500, alpha=0.8)
nx.draw_networkx_edges(effect_network, pos, width=norm_weights, alpha=0.6, edge_color='gray')
nx.draw_networkx_labels(effect_network, pos, font_size=10)
plt.axis('off')
plt.title('Effect Co-occurrence Network (min 100 co-occurrences)', fontsize=16)
save_plot('effect_network.png')

# Create flavor co-occurrence network
flavor_network, flavor_cooccurrence = create_network_graph(df, 'Flavor', min_cooccurrence=40)

plt.figure(figsize=(16, 14))
pos = nx.spring_layout(flavor_network, k=0.3, iterations=50)
edge_weights = [flavor_network[u][v]['weight'] for u, v in flavor_network.edges()]
max_weight = max(edge_weights)
norm_weights = [3 * w / max_weight for w in edge_weights]

nx.draw_networkx_nodes(flavor_network, pos, node_color='lightgreen', node_size=500, alpha=0.8)
nx.draw_networkx_edges(flavor_network, pos, width=norm_weights, alpha=0.6, edge_color='gray')
nx.draw_networkx_labels(flavor_network, pos, font_size=10)
plt.axis('off')
plt.title('Flavor Co-occurrence Network (min 40 co-occurrences)', fontsize=16)
save_plot('flavor_network.png')

# Cross-relationship between effects and flavors
# Create effect-flavor correlation matrix
effect_flavor_corr = pd.DataFrame()
for effect_col in effect_cols:
    effect = effect_col.replace('Effects_', '').replace('_', ' ')
    for flavor_col in flavor_cols:
        flavor = flavor_col.replace('Flavor_', '').replace('_', ' ')
        corr = df[effect_col].corr(df[flavor_col])
        effect_flavor_corr.loc[effect, flavor] = corr

# Plot heatmap of top effect-flavor correlations
top_effects = effect_corr.index[:10].str.replace('Effects_', '').str.replace('_', ' ')
top_flavors = flavor_corr.index[:10].str.replace('Flavor_', '').str.replace('_', ' ')

plt.figure(figsize=(16, 12))
sns.heatmap(effect_flavor_corr.loc[top_effects, top_flavors], annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Between Top Effects and Flavors', fontsize=16)
plt.xlabel('Flavors', fontsize=14)
plt.ylabel('Effects', fontsize=14)
plt.xticks(rotation=45, ha='right')
save_plot('effect_flavor_correlation.png')


3. RELATIONSHIP ANALYSIS


In [8]:
# 4. STATISTICAL TESTING
print("\n" + "=" * 80)
print("4. STATISTICAL TESTING")
print("=" * 80)

# Test for differences in ratings between strain types
print("ANOVA Test for Rating Differences Across Strain Types:")
strain_groups = [df[df['Type'] == strain]['Rating'] for strain in strain_types_list]
f_stat, p_val = stats.f_oneway(*strain_groups)
print(f"F-statistic: {f_stat:.4f}")
print(f"p-value: {p_val:.4f}")
print(f"Significant differences: {p_val < 0.05}")

# Tukey's HSD post-hoc test
import statsmodels.stats.multicomp as mc
comp = mc.MultiComparison(df['Rating'], df['Type'])
post_hoc_res = comp.tukeyhsd()
print("\nTukey's HSD Test Results:")
print(post_hoc_res)

# Correlation between number of effects/flavors and ratings
corr_effects_count = df['Effects_Count'].corr(df['Rating'])
corr_flavors_count = df['Flavor_Count'].corr(df['Rating'])

print("\nCorrelation between number of effects and rating:")
print(f"Pearson r: {corr_effects_count:.4f}")
print(f"p-value: {stats.pearsonr(df['Effects_Count'], df['Rating'])[1]:.4f}")

print("\nCorrelation between number of flavors and rating:")
print(f"Pearson r: {corr_flavors_count:.4f}")
print(f"p-value: {stats.pearsonr(df['Flavor_Count'], df['Rating'])[1]:.4f}")

# Create scatter plots for these correlations
plt.figure(figsize=(12, 6))
sns.regplot(x='Effects_Count', y='Rating', data=df, scatter_kws={'alpha':0.5}, line_kws={'color':'red'})
plt.title(f'Correlation Between Number of Effects and Rating (r={corr_effects_count:.4f})', fontsize=16)
plt.xlabel('Number of Effects', fontsize=14)
plt.ylabel('Rating', fontsize=14)
save_plot('effects_count_rating_correlation.png')

plt.figure(figsize=(12, 6))
sns.regplot(x='Flavor_Count', y='Rating', data=df, scatter_kws={'alpha':0.5}, line_kws={'color':'red'})
plt.title(f'Correlation Between Number of Flavors and Rating (r={corr_flavors_count:.4f})', fontsize=16)
plt.xlabel('Number of Flavors', fontsize=14)
plt.ylabel('Rating', fontsize=14)
save_plot('flavor_count_rating_correlation.png')


4. STATISTICAL TESTING
ANOVA Test for Rating Differences Across Strain Types:
F-statistic: 1.0155
p-value: 0.3624
Significant differences: False

Tukey's HSD Test Results:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
hybrid indica   0.0561 0.3345  -0.037 0.1493  False
hybrid sativa   0.0122 0.9629  -0.097 0.1214  False
indica sativa  -0.0439 0.6638 -0.1633 0.0754  False
---------------------------------------------------

Correlation between number of effects and rating:
Pearson r: 0.6080
p-value: 0.0000

Correlation between number of flavors and rating:
Pearson r: 0.4341
p-value: 0.0000


In [9]:
# 5. ADVANCED ANALYSIS
print("\n" + "=" * 80)
print("5. ADVANCED ANALYSIS")
print("=" * 80)

# Prepare data for PCA and clustering
# Get all one-hot encoded columns
feature_cols = effect_cols + flavor_cols
X = df[feature_cols].values

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA for dimensionality reduction
pca = PCA(n_components=10)  # Reduce to 10 dimensions
X_pca = pca.fit_transform(X_scaled)

# Print explained variance
explained_variance = pca.explained_variance_ratio_
print("PCA Explained Variance:")
print(explained_variance)
print(f"Total explained variance with 10 components: {sum(explained_variance):.4f}")

# Plot explained variance
plt.figure(figsize=(12, 6))
plt.plot(range(1, len(explained_variance) + 1), np.cumsum(explained_variance), marker='o')
plt.axhline(y=0.7, color='r', linestyle='--')
plt.title('Cumulative Explained Variance by PCA Components', fontsize=16)
plt.xlabel('Number of Components', fontsize=14)
plt.ylabel('Cumulative Explained Variance', fontsize=14)
plt.grid(True)
save_plot('pca_explained_variance.png')

# First two principal components and their loadings
pca_df = pd.DataFrame(
    data=pca.components_[:2],
    columns=feature_cols
)

plt.figure(figsize=(20, 10))
sns.heatmap(pca_df, cmap='coolwarm', center=0)
plt.title('PCA Component Loadings for Top 2 Components', fontsize=16)
plt.xlabel('Features', fontsize=14)
plt.ylabel('Principal Components', fontsize=14)
plt.xticks(rotation=90)
save_plot('pca_component_loadings.png')

# K-means clustering
# Determine optimal number of clusters using the elbow method
inertia = []
k_range = range(2, 11)
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_pca)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(12, 6))
plt.plot(k_range, inertia, marker='o')
plt.title('Elbow Method for Optimal k', fontsize=16)
plt.xlabel('Number of Clusters (k)', fontsize=14)
plt.ylabel('Inertia', fontsize=14)
plt.grid(True)
save_plot('kmeans_elbow.png')

# Apply K-means with the optimal number of clusters (let's choose 4 for this example)
n_clusters = 4  # Choose based on elbow method
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_pca)

# t-SNE for visualization
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_pca)

# Create dataframe for visualization
tsne_df = pd.DataFrame({
    'x': X_tsne[:, 0],
    'y': X_tsne[:, 1],
    'Strain': df['Strain'],
    'Type': df['Type'],
    'Rating': df['Rating'],
    'Cluster': df['Cluster']
})

# Visualize clusters
plt.figure(figsize=(14, 10))
sns.scatterplot(x='x', y='y', hue='Cluster', data=tsne_df, palette='viridis', alpha=0.7)
plt.title('t-SNE Visualization of Cannabis Strain Clusters', fontsize=16)
plt.xlabel('t-SNE Component 1', fontsize=14)
plt.ylabel('t-SNE Component 2', fontsize=14)
save_plot('tsne_clusters.png')

# Visualize clusters with original strain types
plt.figure(figsize=(14, 10))
sns.scatterplot(x='x', y='y', hue='Type', data=tsne_df, palette='Set2', alpha=0.7)
plt.title('t-SNE Visualization of Cannabis Strain Types', fontsize=16)
plt.xlabel('t-SNE Component 1', fontsize=14)
plt.ylabel('t-SNE Component 2', fontsize=14)
save_plot('tsne_strain_types.png')

# Analyze cluster characteristics
cluster_profile = pd.DataFrame()
for i in range(n_clusters):
    cluster_data = df[df['Cluster'] == i]
    cluster_profile[f"Cluster {i}"] = [
        len(cluster_data),
        cluster_data['Rating'].mean(),
        cluster_data['Type'].value_counts().idxmax(),
        cluster_data[cluster_data['Type'] == 'indica'].shape[0] / len(cluster_data) if len(cluster_data) > 0 else 0,
        cluster_data[cluster_data['Type'] == 'sativa'].shape[0] / len(cluster_data) if len(cluster_data) > 0 else 0,
        cluster_data[cluster_data['Type'] == 'hybrid'].shape[0] / len(cluster_data) if len(cluster_data) > 0 else 0,
        cluster_data['Effects_Count'].mean(),
        cluster_data['Flavor_Count'].mean()
    ]

cluster_profile.index = ['Size', 'Avg Rating', 'Dominant Type', 'Indica %', 'Sativa %', 'Hybrid %', 'Avg Effects', 'Avg Flavors']
print("\nCluster Profiles:")
print(cluster_profile)

# Top effects and flavors in each cluster
def top_features_by_cluster(df, cluster_id, feature_prefix, top_n=5):
    cluster_df = df[df['Cluster'] == cluster_id]
    cols = [col for col in df.columns if col.startswith(feature_prefix)]
    totals = cluster_df[cols].sum().sort_values(ascending=False)
    return totals.head(top_n)

# Plot top effects for each cluster
plt.figure(figsize=(20, 12))
for i in range(n_clusters):
    top_effects = top_features_by_cluster(df, i, 'Effects_', top_n=5)
    
    plt.subplot(2, 2, i+1)
    top_effects.plot(kind='bar')
    plt.title(f'Top 5 Effects in Cluster {i}', fontsize=14)
    plt.ylabel('Count', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    
save_plot('cluster_top_effects.png')

# Plot top flavors for each cluster
plt.figure(figsize=(20, 12))
for i in range(n_clusters):
    top_flavors = top_features_by_cluster(df, i, 'Flavor_', top_n=5)
    
    plt.subplot(2, 2, i+1)
    top_flavors.plot(kind='bar')
    plt.title(f'Top 5 Flavors in Cluster {i}', fontsize=14)
    plt.ylabel('Count', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    
save_plot('cluster_top_flavors.png')


5. ADVANCED ANALYSIS
PCA Explained Variance:
[0.04186276 0.03727484 0.03456102 0.03362024 0.03055141 0.02886251
 0.02503676 0.02237808 0.02098946 0.02069704]
Total explained variance with 10 components: 0.2958

Cluster Profiles:
              Cluster 0 Cluster 1 Cluster 2 Cluster 3
Size               1223         1      1012       115
Avg Rating     4.426083       4.0  4.427569  2.056522
Dominant Type    hybrid    hybrid    hybrid    hybrid
Indica %       0.466885       0.0  0.099802  0.234783
Sativa %       0.056419       0.0  0.341897  0.217391
Hybrid %       0.476697       1.0    0.5583  0.547826
Avg Effects    4.925593       2.0   4.97332   1.53913
Avg Flavors    2.916599       2.0  2.914032   0.93913


In [10]:
# 6. INSIGHT SUMMARY
print("\n" + "=" * 80)
print("6. INSIGHT SUMMARY")
print("=" * 80)

# Rating distribution summary
print(f"Average cannabis strain rating: {df['Rating'].mean():.2f}")
print(f"Minimum rating: {df['Rating'].min():.2f}")
print(f"Maximum rating: {df['Rating'].max():.2f}")

# Effect and flavor counts
print(f"\nNumber of unique effects: {len(effects_set)}")
print(f"Number of unique flavors: {len(flavors_set)}")

# Most common effects and flavors
print("\nTop 5 most common effects:")
for effect, count in effects_counter.most_common(5):
    print(f"- {effect}: {count}")

print("\nTop 5 most common flavors:")
for flavor, count in flavors_counter.most_common(5):
    print(f"- {flavor}: {count}")

# Average ratings by strain type
print("\nAverage ratings by strain type:")
for strain_type in strain_types_list:
    avg_rating = df[df['Type'] == strain_type]['Rating'].mean()
    print(f"- {strain_type}: {avg_rating:.2f}")

# Effects and flavors with highest correlations to high ratings
print("\nEffects most associated with high ratings:")
for effect, corr in effect_corr.head(5).items():
    effect_name = effect.replace('Effects_', '').replace('_', ' ')
    print(f"- {effect_name}: {corr:.4f}")

print("\nFlavors most associated with high ratings:")
for flavor, corr in flavor_corr.head(5).items():
    flavor_name = flavor.replace('Flavor_', '').replace('_', ' ')
    print(f"- {flavor_name}: {corr:.4f}")

# Create a summary dashboard figure
plt.figure(figsize=(20, 20))

# Plot 1: Distribution of strain types
plt.subplot(3, 2, 1)
ax = sns.countplot(x='Type', data=df, palette=colors)
for i, count in enumerate(strain_types):
    ax.text(i, count + 10, str(count), ha='center')
plt.title('Distribution of Cannabis Strain Types', fontsize=14)
plt.xlabel('Strain Type', fontsize=12)
plt.ylabel('Count', fontsize=12)

# Plot 2: Rating distribution
plt.subplot(3, 2, 2)
sns.boxplot(x='Type', y='Rating', data=df, palette=colors)
plt.title('Rating Distribution by Strain Type', fontsize=14)
plt.xlabel('Strain Type', fontsize=12)
plt.ylabel('Rating', fontsize=12)

# Plot 3: Top effects
plt.subplot(3, 2, 3)
top5_effects = pd.DataFrame(effects_counter.most_common(5), columns=['Effect', 'Count'])
sns.barplot(x='Count', y='Effect', data=top5_effects, palette='viridis')
plt.title('Top 5 Most Common Effects', fontsize=14)
plt.xlabel('Count', fontsize=12)
plt.ylabel('Effect', fontsize=12)

# Plot 4: Top flavors
plt.subplot(3, 2, 4)
top5_flavors = pd.DataFrame(flavors_counter.most_common(5), columns=['Flavor', 'Count'])
sns.barplot(x='Count', y='Flavor', data=top5_flavors, palette='viridis')
plt.title('Top 5 Most Common Flavors', fontsize=14)
plt.xlabel('Count', fontsize=12)
plt.ylabel('Flavor', fontsize=12)

# Plot 5: Clusters
plt.subplot(3, 2, 5)
sns.scatterplot(x='x', y='y', hue='Cluster', data=tsne_df, palette='viridis', alpha=0.7)
plt.title('Cannabis Strain Clusters', fontsize=14)
plt.xlabel('t-SNE Component 1', fontsize=12)
plt.ylabel('t-SNE Component 2', fontsize=12)

# Plot 6: Effect-rating correlation
plt.subplot(3, 2, 6)
effect_corr_top5 = effect_corr.head(5)
sns.barplot(x=effect_corr_top5.values, y=effect_corr_top5.index.str.replace('Effects_', '').str.replace('_', ' '))
plt.title('Top 5 Effects with Highest Rating Correlation', fontsize=14)
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.ylabel('Effect', fontsize=12)

plt.tight_layout()
save_plot('summary_dashboard.png')

print("\nAnalysis complete! All visualizations have been saved in the 'plots' folder.")


6. INSIGHT SUMMARY
Average cannabis strain rating: 4.31
Minimum rating: 0.00
Maximum rating: 5.00

Number of unique effects: 16
Number of unique flavors: 50

Top 5 most common effects:
- Happy: 1871
- Relaxed: 1726
- Euphoric: 1635
- Uplifted: 1507
- Creative: 747

Top 5 most common flavors:
- Earthy: 1105
- Sweet: 1053
- Citrus: 527
- Pungent: 451
- Berry: 355

Average ratings by strain type:
- hybrid: 4.29
- sativa: 4.30
- indica: 4.35

Effects most associated with high ratings:
- Happy: 0.2914
- Relaxed: 0.2550
- Euphoric: 0.2252
- Uplifted: 0.1962
- Creative: 0.1018

Flavors most associated with high ratings:
- Sweet: 0.1445
- Earthy: 0.1195
- Berry: 0.0816
- Citrus: 0.0799
- Pine: 0.0661

Analysis complete! All visualizations have been saved in the 'plots' folder.
