In [1]:
import pandas as pd

# Load the data
data = pd.read_csv('Marvel_Comics.csv')

# Display the first few rows of the dataset
data.head()


Unnamed: 0,comic_name,active_years,issue_title,publish_date,issue_description,penciler,writer,cover_artist,Imprint,Format,Rating,Price
0,A Year of Marvels: April Infinite Comic (2016),(2016),A Year of Marvels: April Infinite Comic (2016) #1,"April 01, 2016",The Infinite Comic that will have everyone tal...,Yves Bigerel,Yves Bigerel,Jamal Campbell,Marvel Universe,Infinite Comic,Rated T+,Free
1,A Year of Marvels: August Infinite Comic (2016),(2016),A Year of Marvels: August Infinite Comic (2016...,"August 10, 2016","It’s August, and Nick Fury is just in time to ...",Jamal Campbell,"Chris Sims, Chad Bowers",,Marvel Universe,Infinite Comic,,Free
2,A Year of Marvels: February Infinite Comic (2016),(2016),A Year of Marvels: February Infinite Comic (20...,"February 10, 2016",Join us in a brand new Marvel comics adventure...,"Danilo S. Beyruth, M Mast",Ryan North,,Marvel Universe,Infinite Comic,Rated T+,Free
3,A Year of Marvels: July Infinite Comic (2016),(2016),A Year of Marvels: July Infinite Comic (2016) #1,"June 29, 2016",Celebrating the Fourth of July is complicated ...,Juanan Ramirez,Chuck Wendig,Jamal Campbell,Marvel Universe,Infinite Comic,,Free
4,A Year of Marvels: June Infinite Comic (2016),(2016),A Year of Marvels: June Infinite Comic (2016) #1,"June 15, 2016",Sam Alexander’s finding it hard to cope with t...,Diego Olortegui,Paul Allor,Jamal Campbell,Marvel Universe,Infinite Comic,,Free


### First attempt at finding out most used words

In [2]:
import re
from collections import Counter

def extract_capitalized_words(text):
    # Use regex to find sequences of capitalized words
    return re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)

# Combine the comic_name and issue_title columns into a single text
combined_text = data['comic_name'].str.cat(data['issue_title'], sep=' ').str.cat(data['issue_description'], sep=' ')

# Extract capitalized words from the combined text
capitalized_words = combined_text.apply(extract_capitalized_words).explode().dropna()

# Count the occurrences of each capitalized word/sequence
word_counts = Counter(capitalized_words)

# Get the top 10 most common capitalized words/sequences
most_common_words = word_counts.most_common(10)
most_common_words


[('Man', 9686),
 ('Men', 7537),
 ('The', 6918),
 ('Spider', 6089),
 ('But', 4581),
 ('None', 4087),
 ('And', 4079),
 ('Avengers', 3560),
 ('Marvel', 2887),
 ('Captain America', 2858)]

### First attempt at finding out most used words

In [3]:
def refined_extract(text):
    # Adjust regex to consider words connected by a hyphen as a single word
    return re.findall(r'\b[A-Z][a-z]+(?:[-\s]+[A-Z][a-z]+)*\b', text)

# Extract refined capitalized words from the combined text
refined_words = combined_text.apply(refined_extract).explode().dropna()

# Count the occurrences of each refined word/sequence
refined_word_counts = Counter(refined_words)

# Exclude the word "None", common articles, and conjunctions from the top list
exclude_words = {"None", "The", "But", "And", "A", "Or", "For", "With", "As", "In", "To", "Of", "Is", "On"}

top_refined_words = [(word, count) for word, count in refined_word_counts.most_common() if word not in exclude_words]

# Get the top 10 refined words/sequences
top_10_refined = top_refined_words[:10]
top_10_refined


[('Men', 7438),
 ('Spider-Man', 4226),
 ('Avengers', 3536),
 ('Captain America', 2846),
 ('Marvel', 2838),
 ('Wolverine', 2716),
 ('Thor', 2600),
 ('It', 2534),
 ('Iron Man', 2380),
 ('Fantastic Four', 2282)]

### First attempt at analysing the most prolific writers

In [4]:
# Clean up the 'writer' column by splitting it on commas and ampersands to handle cases with multiple writers
writers_list = data['writer'].str.split(',|&').explode().str.strip()

# Associate each writer with the 'comic_name' and 'active_years' from the main dataset
writer_data = data.loc[writers_list.index, ['comic_name', 'active_years']]
writer_data['writer'] = writers_list.values

# Group by each writer to count the number of comics written and aggregate the years and comic names
grouped_writers = writer_data.groupby('writer').agg({
    'comic_name': ['count', lambda x: x.str.extractall(r'\b([A-Z][a-z]+(?:[-\s]+[A-Z][a-z]+)*)\b').value_counts().index[:5].tolist()],
    'active_years': 'unique'
}).sort_values(('comic_name', 'count'), ascending=False)

# Rename columns for clarity
grouped_writers.columns = ['Number of Comics Written', 'Top 5 Associated Superheroes', 'Years Written']

# Display the top writers based on the number of comics written
top_writers = grouped_writers.head(10)
top_writers


Unnamed: 0_level_0,Number of Comics Written,Top 5 Associated Superheroes,Years Written
writer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,7397,"[(Present,), (Men,), (Conan,), (Star Wars,), (...","[(2016), (1998 - 1999), (2019), (2008), (1970 ..."
Stan Lee,980,"[(Tales,), (Fantastic Four,), (Strange Tales,)...","[(1970 - 1975), (2006), (2007), (1941 - 1947),..."
Brian Michael Bendis,906,"[(Ultimate Spider-Man,), (New Avengers,), (Men...","[(2013), (2001 - 2003), (2012 - 2015), (1999 -..."
Chris Claremont,823,"[(Men,), (Uncanny,), (New Mutants,), (Treme,),...","[(1983 - 1994), (1967 - 1994), (2008 - 2009), ..."
Peter David,711,"[(Factor,), (Incredible Hulk,), (Dark Tower,),...","[(2012 - 2014), (2019), (2014 - 2015), (1999 -..."
Roy Thomas,586,"[(Marvel Illustrated,), (Doctor Strange,), (Me...","[(1970 - 1976), (1970), (1963 - 1996), (1998 -..."
Tom Defalco,510,"[(Fantastic Four,), (The Amazing Spider-Man,),...","[(1998 - 1999), (2006 - 2009), (1964 - 2018), ..."
Fabian Nicieza,419,"[(Cable,), (Deadpool,), (Force,), (Men,), (Thu...","[(1991 - 1992), (2015), (1983 - 1994), (1963 -..."
Jason Aaron,399,"[(Wolverine,), (Men,), (Thor,), (Star Wars,), ...","[(2019), (2013 - 2015), (2010 - 2011), (2010),..."
Mark Waid,372,"[(Daredevil,), (Captain America,), (Avengers,)...","[(2013), (2015 - 2016), (1999 - 2013), (2018),..."


### First attempt at analysing the most prolific writers

In [5]:
# Remove the "None" entry
top_writers = top_writers.drop("None")

def format_years(years_list):
    # Extract years from strings like "(1967 - 1968)" or "(1967)" and convert them to integers
    years = sorted([int(year) for sublist in [re.findall(r'(\d+)', year_interval) for year_interval in years_list] for year in sublist])
    
    if not years:
        return ""
    
    # Group consecutive years
    groups = []
    current_group = [years[0]]
    for y in years[1:]:
        if y - current_group[-1] == 1:
            current_group.append(y)
        else:
            groups.append(current_group)
            current_group = [y]
    groups.append(current_group)

    # Format years as intervals or single years
    formatted_years = [f"{group[0]}-{group[-1]}" if len(group) > 1 else str(group[0]) for group in groups]
    return ", ".join(formatted_years)

# Apply the format_years function to the 'Years Written' column
top_writers['Years Written'] = top_writers['Years Written'].apply(format_years)

top_writers


Unnamed: 0_level_0,Number of Comics Written,Top 5 Associated Superheroes,Years Written
writer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Stan Lee,980,"[(Tales,), (Fantastic Four,), (Strange Tales,)...","1941, 1941, 1945, 1947-1948, 1948-1951, 1951-1..."
Brian Michael Bendis,906,"[(Ultimate Spider-Man,), (New Avengers,), (Men...","1998, 1998-2000, 2000, 2000-2001, 2001, 2001, ..."
Chris Claremont,823,"[(Men,), (Uncanny,), (New Mutants,), (Treme,),...","1962, 1964, 1966-1967, 1967-1968, 1968, 1970-1..."
Peter David,711,"[(Factor,), (Incredible Hulk,), (Dark Tower,),...","1962-1963, 1976, 1976, 1985-1986, 1986, 1986, ..."
Roy Thomas,586,"[(Marvel Illustrated,), (Doctor Strange,), (Me...","1951, 1959, 1961-1963, 1963, 1963, 1963, 1963-..."
Tom Defalco,510,"[(Fantastic Four,), (The Amazing Spider-Man,),...","1961, 1963, 1963-1964, 1966-1968, 1972, 1974, ..."
Fabian Nicieza,419,"[(Cable,), (Deadpool,), (Force,), (Men,), (Thu...","1963-1964, 1967-1968, 1970, 1976, 1981, 1983, ..."
Jason Aaron,399,"[(Wolverine,), (Men,), (Thor,), (Star Wars,), ...","2003, 2005-2006, 2008, 2008, 2008-2009, 2009, ..."
Mark Waid,372,"[(Daredevil,), (Captain America,), (Avengers,)...","1963, 1968, 1981, 1991, 1993-1996, 1996, 1996-..."


### Examining the descriptions of comics to spot patterns

In [6]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Extract words from the 'issue_description' column
words_list = data['issue_description'].str.split().explode().str.lower()

# Filter out common English stop words (conjunctions, prepositions, etc.) and missing values ("none")
filtered_words = words_list[~words_list.isin(ENGLISH_STOP_WORDS) & (words_list != "none")]

# Count the occurrences of each word
word_counts = Counter(filtered_words)

# Get the top 20 most common words
top_20_words = word_counts.most_common(20)
top_20_words


[('new', 6737),
 ('-', 4480),
 ('marvel', 4083),
 ("it's", 3264),
 ('man', 2969),
 ('x-men', 2606),
 ('spider-man', 2508),
 ('avengers', 2405),
 ('captain', 2220),
 ('world', 2136),
 ('team', 2055),
 ('iron', 1996),
 ('battle', 1920),
 ('time', 1910),
 ('war', 1823),
 ('just', 1802),
 ('save', 1795),
 ('black', 1786),
 ('...$2.99', 1680),
 ('secret', 1624)]

In [7]:
# Get the words/phrases ranked 20th to 50th in terms of frequency
words_20_to_50 = word_counts.most_common(50)[20:50]
words_20_to_50

[("he's", 1595),
 ('heroes', 1543),
 ('life', 1508),
 ('32', 1464),
 ('fantastic', 1416),
 ('--', 1395),
 ('stop', 1365),
 ('does', 1362),
 ('hulk', 1292),
 ('make', 1255),
 ('peter', 1249),
 ('story', 1225),
 ('comes', 1218),
 ('face', 1211),
 ('way', 1200),
 ('power', 1165),
 ('america', 1161),
 ('fight', 1141),
 ('come', 1117),
 ('rated', 1107),
 ('mysterious', 1103),
 ('wolverine', 1094),
 ('thor', 1091),
 ('ultimate', 1068),
 ('universe', 1066),
 ('young', 1064),
 ('super', 1062),
 ('hero', 1052),
 ('end', 1046),
 ('dark', 1025)]