### Setup and Feature Selection
Import data, convert text data to numerical data, and select features (can change chosen features in this block).

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the raw survey export; every later step derives from this frame.
df = pd.read_csv('masculinity.csv')

# The first three columns are irrelevant to the analysis, so keep only the rest.
df = df.iloc[:, 3:]

# Encode on a copy so `df` keeps the original text answers.
features = df.copy()

# TODO
# what to do with "no answer"
# convert floats to ints?
# nothing obvious to do with q0009
# q0010 only applies to employed
# q0025 is about children, probably can do something with that
# q0026 is sexual orientation


def _ordinal_codes(*levels):
    """Return a dict mapping each answer to its position (0, 1, 2, ...) in `levels`."""
    return {answer: rank for rank, answer in enumerate(levels)}


def _flag_if_equal(target):
    """Return a function that gives 1 when its argument equals `target`, else 0."""
    def encode(answer):
        return 1 if answer == target else 0
    return encode


def _flag_selected(answer):
    """Return 0 for the literal 'Not selected' and 1 for any other value."""
    return 0 if answer == 'Not selected' else 1


def _columns_starting_with(frame, *prefixes):
    """List, in column order, the columns of `frame` whose name begins with any prefix."""
    return [name for name in frame.columns if name.startswith(prefixes)]


_yes_flag = _flag_if_equal('Yes')

# NOTE: Series.map with a dict turns any answer missing from the dict into NaN.

# q0001 - In general, how masculine or "manly" do you feel?
features['q0001'] = features['q0001'].map(_ordinal_codes(
    'Not at all masculine',
    'Not very masculine',
    'Somewhat masculine',
    'Very masculine',
))

# q0002 - How important is it to you that others see you as masculine?
features['q0002'] = features['q0002'].map(_ordinal_codes(
    'Not at all important',
    'Not too important',
    'Somewhat important',
    'Very important',
))

# q0004 - Where have you gotten your ideas about what it means to be a good man?
q4_cols = _columns_starting_with(features, 'q0004')
for column in q4_cols:
    features[column] = features[column].apply(_flag_selected)

# q0005 - Do you think that society puts pressure on men in a way that is
# unhealthy or bad for them?
features['q0005'] = features['q0005'].apply(_yes_flag)

# q0007 - How often would you do a variety of things? (masculine and not)
_frequency_codes = _ordinal_codes(
    'Never, and not open to it',
    'Never, but open to it',
    'Rarely',
    'Sometimes',
    'Often',
)
q7_cols = _columns_starting_with(features, 'q0007')
for column in q7_cols:
    features[column] = features[column].map(_frequency_codes)

# q0008 - Which of the following do you worry about on a daily or near daily
# basis? (masculine and not)
q8_cols = _columns_starting_with(features, 'q0008')
for column in q8_cols:
    features[column] = features[column].apply(_flag_selected)

# drop all that pertain to work or are shown to subset groups q9 - q16
cols_to_drop = [
    'q0009',
    'q0010_0001', 'q0010_0002', 'q0010_0003', 'q0010_0004',
    'q0010_0005', 'q0010_0006', 'q0010_0007', 'q0010_0008',
    'q0011_0001', 'q0011_0002', 'q0011_0003', 'q0011_0004', 'q0011_0005',
    'q0012_0001', 'q0012_0002', 'q0012_0003', 'q0012_0004',
    'q0012_0005', 'q0012_0006', 'q0012_0007',
    'q0013', 'q0014', 'q0015',
]
features = features.drop(cols_to_drop, axis='columns')

# q0017 - Do you typically feel as though you're expected to make the first
# move in romantic relationships?
features['q0017'] = features['q0017'].apply(_yes_flag)

# q0018 - How often do you try to be the one who pays when on a date?
features['q0018'] = features['q0018'].map(_ordinal_codes(
    'Never', 'Rarely', 'Sometimes', 'Often', 'Always',
))

# drop q19, q20, q21
cols_to_drop2 = [
    'q0019_0001', 'q0019_0002', 'q0019_0003', 'q0019_0004',
    'q0019_0005', 'q0019_0006', 'q0019_0007',
    'q0020_0001', 'q0020_0002', 'q0020_0003',
    'q0020_0004', 'q0020_0005', 'q0020_0006',
    'q0021_0001', 'q0021_0002', 'q0021_0003', 'q0021_0004',
]
features = features.drop(cols_to_drop2, axis='columns')

# q0022 - Have you changed your behavior in romantic relationships in the wake
# of #MeToo movement?
features['q0022'] = features['q0022'].apply(_yes_flag)

# demographics
# Several columns below are encoded and then dropped straight away; the drop
# line is the switch that keeps the column out of the current model.

# drop q 24, 25, 26, 28
cols_to_drop3 = ['q0024', 'q0025_0001', 'q0025_0002', 'q0025_0003', 'q0026', 'q0028']
features = features.drop(cols_to_drop3, axis='columns')

# q0029 - What is the last grade of school you completed?
features['q0029'] = features['q0029'].map(_ordinal_codes(
    "Did not complete high school",
    "High school or G.E.D.",
    "Associate's degree",
    "Some college",
    "College graduate",
    "Post graduate degree",
))
features = features.drop('q0029', axis='columns')

# drop q30 - what state do you live in?
features = features.drop('q0030', axis='columns')

# q0034 - income bracket, lowest to highest
features['q0034'] = features['q0034'].map(_ordinal_codes(
    "$0-$9,999",
    "$10,000-$24,999",
    "$25,000-$49,999",
    "$50,000-$74,999",
    "$75,000-$99,999",
    "$100,000-$124,999",
    "$125,000-$149,999",
    "$150,000-$174,999",
    "$175,000-$199,999",
    "$200,000+",
))
features = features.drop('q0034', axis='columns')

# drop q 35, 36
features = features.drop(['q0035', 'q0036'], axis='columns')

features['race2'] = features['race2'].apply(_flag_if_equal('White'))
features = features.drop('race2', axis='columns')

# drop 'racethn4'
features = features.drop('racethn4', axis='columns')

features['educ3'] = features['educ3'].map(_ordinal_codes(
    "High school or less", "Some college", "College or more",
))
features = features.drop('educ3', axis='columns')

features['educ4'] = features['educ4'].map(_ordinal_codes(
    "High school or less", "Some college", "College or more", "Post graduate degree",
))
features = features.drop('educ4', axis='columns')

features['age3'] = features['age3'].map(_ordinal_codes('18 - 34', '35 - 64', '65 and up'))
features = features.drop('age3', axis='columns')

features['kids'] = features['kids'].apply(_flag_if_equal('Has children'))
features = features.drop('kids', axis='columns')

# drop 'orientation'
features = features.drop('orientation', axis='columns')

# what is this weight column? ranges from 0.02 - 8.67?
# drop 'weight'
features = features.drop('weight', axis='columns')

# model 7 - subset to just q7 and q8
cols78 = _columns_starting_with(features, 'q0007', 'q0008')
features = features[cols78]

print(features.columns)

### Data Visualizations

In [None]:
# Pair of encoded survey questions to compare against each other.
cols = ['q0001', 'q0002']

# The feature-selection cell can remove these columns (the "model 7" subset
# keeps only q0007*/q0008*), so check first instead of failing with a KeyError.
missing_cols = [col for col in cols if col not in features.columns]

if missing_cols:
    print(f"Skipping plot: {missing_cols} not in the current feature set; "
          "re-run feature selection with these columns kept.")
else:
    # Rows with an unanswered/unmapped value in either question are excluded.
    subset = features[cols].dropna()

    # Counts for every combination of the two encoded answers.
    print(pd.crosstab(subset[cols[0]], subset[cols[1]]))

    # Low alpha so that overlapping points show up as darker spots.
    plt.scatter(subset[cols[0]], subset[cols[1]], alpha=0.1)
    plt.show()

### Modeling

Each model uses 2 clusters, in the hope that they separate into a "traditionally masculine" group and a "not traditionally masculine" group. This is not guaranteed.

Goal: Start with a model that includes the most features, then subset to fewer and fewer features. Analyze the results and groupings of each.

Shortcomings (less severe as the feature set gets smaller): probably too many features, some of which may not even be relevant, and many rows are removed because they contain NaN.

In [None]:
# drop all rows that are na, convert to ints, 2 clusters

def kMeansLabelsPCA(features_df):
    """Cluster the complete rows of `features_df` into two groups and rank features by PC1 loading.

    Rows containing any NaN are dropped, the remaining values are cast to int
    and standardized, and both KMeans (2 clusters) and PCA are fitted on the
    standardized data.

    Parameters
    ----------
    features_df : pandas.DataFrame
        Numerically encoded features; NaN marks a missing answer.

    Returns
    -------
    labeled_points : pandas.DataFrame
        Single 'label' column holding the cluster id of each surviving row,
        indexed like those rows so it can be joined back to the source data.
    important_features_pc1 : pandas.Series
        Absolute loading of each feature on the first principal component,
        sorted from largest to smallest.
    """
    a = features_df.dropna()
    a = a.astype(int)
    scaler = StandardScaler()
    a_scaled = scaler.fit_transform(a)
    model = KMeans(init='k-means++', n_clusters=2, random_state=49)
    labels = model.fit_predict(a_scaled)
    #print(model.cluster_centers_)
    labeled_points = pd.DataFrame(labels, index=a.index, columns=['label'])

    # PCA to determine which features contribute most to variance
    pca = PCA()
    pca.fit(a_scaled)

    # Label the loadings with the columns of the data that was actually fitted
    # (not the global `features`), and size the PC index from the fitted
    # components, since PCA keeps at most min(n_rows, n_columns) of them.
    component_names = [f'PC{i + 1}' for i in range(pca.components_.shape[0])]
    feature_importance = pd.DataFrame(pca.components_, columns=a.columns, index=component_names)

    # focus on first principal component
    pc1_contributions = feature_importance.loc['PC1']
    important_features_pc1 = pc1_contributions.abs().sort_values(ascending=False)

    return labeled_points, important_features_pc1

# Fit the model on the currently selected features.
labels, feature_importance = kMeansLabelsPCA(features)

# Attach the cluster label to the original answers; the inner join keeps only
# the rows that received a label.
results = df.join(labels, how='inner')
print(len(results))

results.to_csv('model7.csv', index=False)

# Print feature importance for PC1
print("PC1 Feature Importance:")
print(feature_importance)

# further analysis section based on pca results
# Select from `results` (the labelled frame built above); the previous code
# read from `results1` before it was defined and raised a NameError.
results1 = results[['q0008_0012', 'q0007_0011', 'q0008_0004', 'label']]

# Every column except the trailing 'label'.
columns_to_analyze = results1.columns[:-1]

for col in columns_to_analyze:
    counts = pd.crosstab(results1[col], results1['label'])
    print(f"Counts for {col}:\n{counts}\n")

### Results Notes:
Model 1 (most features, see first cell) - q8 and q7 most important by PCA.. appears to be anxious lonely people vs not.

Model 2 (model 1 minus weight) - same as model 1. anxious lonely men vs not. one sub q8 response change. this is sad.

Model 3 (drop redundant education col) - nearly identical to model 2, reducing education redundancy didn't change much.

Model 4 (only questions, cols starting with q) - same as models 2 and 3. 'q0008_0012', 'q0007_0011', 'q0008_0004' top 3 by PCA

Model 5 (only edu demographic, only survey responses thru q29) - same as models 2, 3 and 4. 'q0008_0012', 'q0007_0011', 'q0008_0004' top 3 by PCA

Model 6 (no demographics, only survey responses thru q22) - same as models 2, 3 and 4. 'q0008_0012', 'q0007_0011', 'q0008_0004' top 3 by PCA

Model 7 - seems clear that q7 and q8 make the biggest difference. Subset to just those qs. 'q0008_0012', 'q0007_0011', 'q0008_0004' top 3 by PCA.

### Conclusion

Using K means clustering, the majority of variability between the two clusters of men who answered this survey was explained by the following:

Cluster 1: 
- Were not worried about any of the common insecurities in question 8 (q0008_0012)
- Were not worried about their physique (q0008_0004)
- Were not seeing a therapist and were less open to the idea (q0007_0011)

Cluster 2:
- Worried about one or more of the insecurities listed in question 8 (q0008_0012)
- Were more likely to be concerned about their physique (q0008_0004)
- Were more likely to have seen or are actively seeing a therapist (q0007_0011)