In [1]:
import pickle
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from joblib import Parallel, delayed
from rapidfuzz import fuzz, process
from tqdm import tqdm

### Scraping Wookieepedia:

In [143]:
category_link = 'https://starwars.fandom.com/wiki/Category:Canon_articles'  # all canon articles
original_link = 'https://starwars.fandom.com'

# Maps the article key (the part of the URL after '/wiki/') to the full article URL.
pages = {}
page_num = 1

# Walk the paginated category listing, following the "next page" button until it
# disappears. `visited_links` guards against a pagination link that points back to
# a page we have already crawled, which would otherwise loop forever.
current_link = category_link
visited_links = set()

# A single session reuses the underlying connection across all listing requests.
with requests.Session() as session:
    while current_link is not None and current_link not in visited_links:
        visited_links.add(current_link)

        req = session.get(current_link, timeout=30)
        # Fail loudly on HTTP errors: an error page contains neither member links nor
        # a "next" button, so the crawl would otherwise stop silently with partial data.
        req.raise_for_status()
        soup = BeautifulSoup(req.content, "html.parser")

        links_before = len(pages)
        # Only anchors with the class category-page__member-link are category members.
        for link in soup.find_all('a', class_='category-page__member-link'):
            href = link.get('href')
            if not href:
                continue  # anchor without a target, nothing to record
            url = urljoin(original_link, href)
            # Use everything after '/wiki/' as the key so that titles containing '/'
            # are not truncated to their last path segment (which could make keys collide).
            key = url.split('/wiki/', 1)[-1]
            # We only want articles, not sub-categories, so skip category links.
            if 'Category:' not in key:
                pages[key] = url

        print(f'Page {page_num} - {len(pages) - links_before}')
        page_num += 1

        # The "next page" button carries the class category-page__pagination-next;
        # it is absent on the last listing page.
        next_link = soup.find("a", class_='category-page__pagination-next')
        next_href = next_link.get('href') if next_link is not None else None
        # urljoin leaves absolute URLs untouched and resolves relative ones.
        current_link = urljoin(current_link, next_href) if next_href else None

print(f'Number of pages: {len(pages)}')



Page 1 - 199
Page 2 - 200
Page 3 - 200
Page 4 - 200
Page 5 - 200
Page 6 - 200
Page 7 - 200
Page 8 - 200
Page 9 - 200
Page 10 - 200
Page 11 - 200
Page 12 - 200
Page 13 - 200
Page 14 - 200
Page 15 - 200
Page 16 - 200






Page 17 - 200


Processing pages:   7%|██▋                                   | 3480/49445 [01:06<11:17, 67.84page/s][A[A

Page 18 - 200
Page 19 - 200
Page 20 - 200
Page 21 - 200
Page 22 - 200
Page 23 - 200
Page 24 - 200
Page 25 - 200
Page 26 - 200
Page 27 - 200
Page 28 - 200
Page 29 - 200
Page 30 - 200
Page 31 - 199
Page 32 - 200
Page 33 - 200
Page 34 - 200
Page 35 - 200
Page 36 - 200
Page 37 - 200
Page 38 - 200
Page 39 - 200
Page 40 - 200
Page 41 - 200
Page 42 - 200
Page 43 - 200
Page 44 - 200
Page 45 - 200
Page 46 - 200
Page 47 - 200
Page 48 - 200
Page 49 - 200
Page 50 - 200
Page 51 - 200
Page 52 - 200
Page 53 - 200
Page 54 - 200
Page 55 - 199
Page 56 - 200
Page 57 - 200
Page 58 - 200
Page 59 - 200
Page 60 - 200
Page 61 - 199
Page 62 - 200
Page 63 - 200
Page 64 - 200
Page 65 - 200
Page 66 - 200
Page 67 - 200
Page 68 - 200
Page 69 - 200
Page 70 - 200
Page 71 - 200
Page 72 - 200
Page 73 - 198
Page 74 - 200
Page 75 - 200
Page 76 - 200
Page 77 - 200
Page 78 - 200
Page 79 - 200
Page 80 - 199
Page 81 - 200
Page 82 - 200
Page 83 - 200
Page 84 - 200
Page 85 - 200
Page 86 - 200
Page 87 - 200
Page 88 - 200
Page 8

#### Saving data

In [146]:
# Cache the scraped {page key: URL} mapping on disk so the crawl does not have to be repeated
with open("../data/pages_dict.pkl", "wb") as pickle_file:
    pickle.dump(pages, pickle_file)

---

#### Reopen data:

In [3]:
# Restore the cached page mapping from disk and display how many pages it holds.
# NOTE: only unpickle files you created yourself; pickle can execute arbitrary code.
with open("../data/pages_dict.pkl", "rb") as pickle_file:
    pages = pickle.load(pickle_file)
len(pages)

49445

### Process all pages to find their attributes in the sidebar:

In [4]:
def process_page(url):
    """Download one article and extract its title and infobox attributes.

    Parameters
    ----------
    url : str
        Full URL of the article to scrape.

    Returns
    -------
    tuple
        ``(name, info)`` where ``name`` is the stripped text of the page header
        (``None`` if the page has no ``h1.page-header__title``) and ``info`` maps
        each infobox row's ``data-source`` attribute to its text value. ``info``
        is empty when the page has no ``<aside>`` element or no matching rows.

    Notes
    -----
    Each value is cut at the first ``'['``. This drops footnote markers such as
    ``[1]``, but it also drops everything that follows the first marker, so
    multi-valued fields keep only their first entry.
    """
    # The timeout prevents a single stalled connection from hanging a worker forever.
    req = requests.get(url, timeout=30)
    soup = BeautifulSoup(req.content, "html.parser")

    # Title of the article; without it the page cannot be identified.
    title_tag = soup.find('h1', class_='page-header__title')
    if title_tag is None:
        return None, {}
    name = title_tag.text.strip()

    # The first <aside> element is used as the infobox; pages without one have no attributes.
    sidebar = soup.find('aside')
    if sidebar is None:
        return name, {}

    info = {}
    for item in sidebar.find_all('div', class_='pi-item pi-data pi-item-spacing pi-border-color'):
        source = item.get("data-source")
        value_tag = item.find('div', class_='pi-data-value pi-font')
        # Skip malformed rows instead of raising KeyError / AttributeError.
        if source is None or value_tag is None:
            continue
        # Keep only the text in front of the first '[' (see Notes in the docstring).
        info[source] = value_tag.text.strip().split('[')[0].strip()
    return name, info


# Scrape every cached page URL in parallel; n_jobs=-1 lets joblib use all available CPU cores
characters = {}
page_urls = tqdm(pages.values(), desc="Processing pages", unit="page", ncols=100)
results = Parallel(n_jobs=-1)(delayed(process_page)(page_url) for page_url in page_urls)

# Keep only pages that produced both a title and at least one infobox field.
# Pages sharing the same title overwrite each other, so the last one wins.
characters.update(
    (title, attributes) for title, attributes in results if title and attributes
)

Processing pages: 100%|█████████████████████████████████████| 49445/49445 [13:48<00:00, 59.71page/s]


#### Saving data:

In [5]:
# Cache the scraped {article title: infobox attributes} mapping so the scrape need not be repeated
with open("../data/characters_info_only_filled_att.pkl", "wb") as pickle_file:
    pickle.dump(characters, pickle_file)

---

#### Reopen data:

In [13]:
# Restore the cached infobox data from disk and display how many articles it contains.
# NOTE: only unpickle files you created yourself; pickle can execute arbitrary code.
with open("../data/characters_info_only_filled_att.pkl", "rb") as pickle_file:
    characters = pickle.load(pickle_file)
len(characters)

42826

### Creating pandas dataframe and filtering:

In [14]:
# One row per scraped article (indexed by its title); the infobox fields become the columns
df = pd.DataFrame.from_dict(data=characters, orient="index")
df

Unnamed: 0,manufacturer,type,cost,purpose,species,affiliation,gender,pronouns,hair,skin,...,albums,form,teams,depth,candidates,electorate,stars,arms,satellites,incubationperiod
"""Accu-Strike"" integrated targeting computer",Industrial Automaton,Targeting computer,"3,000 credits",Providing targeting data and improving accuracy,,,,,,,...,,,,,,,,,,
"""Autoguard"" Cybernetic Reflex Suite",Czerka Arms,Droid brain defense system,"5,000 credits",Warn its user from incoming threats,,,,,,,...,,,,,,,,,,
"""Black Hole Box"" T-11 Armored Databank",Astralor Corp.,Armored computer core,"1,100 credits",Protect information from a starship's full des...,,,,,,,...,,,,,,,,,,
"""Blinder"" 4B2 jamming array",Irilliad,Sensor jammer,,,,,,,,,...,,,,,,,,,,
"""Breaker"" heavy hydrospanner",Regallis Engineering,Hydrospanner,250 credits,Fastening bolts,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zabrak scalp necrosis,,,,,,,,,,,...,,,,,,,,,,
Owauwaq Freight,,,,,,,,,,,...,,,,,,,,,,
Smoothie,,,,,,,,,,,...,,,,,,,,,,
Pipada Symphony Chamber Chorus,,,,,,,,,,,...,,,,,,,,,,


Creating a `name` column and normalizing all names to the same format

In [15]:
# Move the article titles out of the index into a regular 'name' column.
# The guard makes this cell safe to re-run: once the index has been reset it is a
# plain RangeIndex, and resetting it again would fail because 'name' already exists.
if not isinstance(df.index, pd.RangeIndex):
    df = df.rename_axis('name').reset_index()

# The names mix upper/lower case and contain quote characters, so normalise them:
# lowercase, trim surrounding whitespace, then drop double quotes.
df['name'] = (
    df['name']
    .str.lower()
    .str.strip()
    .str.replace('"', '', regex=False)
)
df


Unnamed: 0,name,manufacturer,type,cost,purpose,species,affiliation,gender,pronouns,hair,...,albums,form,teams,depth,candidates,electorate,stars,arms,satellites,incubationperiod
0,accu-strike integrated targeting computer,Industrial Automaton,Targeting computer,"3,000 credits",Providing targeting data and improving accuracy,,,,,,...,,,,,,,,,,
1,autoguard cybernetic reflex suite,Czerka Arms,Droid brain defense system,"5,000 credits",Warn its user from incoming threats,,,,,,...,,,,,,,,,,
2,black hole box t-11 armored databank,Astralor Corp.,Armored computer core,"1,100 credits",Protect information from a starship's full des...,,,,,,...,,,,,,,,,,
3,blinder 4b2 jamming array,Irilliad,Sensor jammer,,,,,,,,...,,,,,,,,,,
4,breaker heavy hydrospanner,Regallis Engineering,Hydrospanner,250 credits,Fastening bolts,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42821,zabrak scalp necrosis,,,,,,,,,,...,,,,,,,,,,
42822,owauwaq freight,,,,,,,,,,...,,,,,,,,,,
42823,smoothie,,,,,,,,,,...,,,,,,,,,,
42824,pipada symphony chamber chorus,,,,,,,,,,...,,,,,,,,,,


---

### Filtering to only include characters from the first six movies (ep 1-6). 

In [16]:
# Load the script-character mapping and collect the lowercased translation names
movie_char = pd.read_csv(
    "../data/movies/movie_characters_mapping.csv",
    sep=';',
    index_col=0,
)
lowercase_names = movie_char['translation_name'].str.lower()
script_names = lowercase_names.tolist()
len(script_names)

283

In [17]:
# Define a function to get best match 
def find_best_match(name, choices, threshold=80):
    """Return the entry of ``choices`` that best matches ``name``.

    Candidates are scored with ``fuzz.token_set_ratio``.

    Parameters
    ----------
    name : str
        The string to look up.
    choices : collection of str
        Candidate strings to compare against.
    threshold : int or float, optional
        Minimum score a candidate must reach to be accepted (default 80).

    Returns
    -------
    str or None
        The best-scoring candidate, or ``None`` when no candidate reaches
        ``threshold`` (including when ``choices`` is empty).
    """
    # score_cutoff makes extractOne return None when nothing reaches the threshold,
    # so the result must be checked before unpacking.
    result = process.extractOne(
        name, choices, scorer=fuzz.token_set_ratio, score_cutoff=threshold
    )
    if result is None:
        return None
    match, _score, _index = result
    return match

# Keep only articles with at least one of these infobox attributes filled in
character_columns = ['species', 'gender', 'class', 'eyes']
filtered_df = df[df[character_columns].notna().any(axis=1)]

all_names = filtered_df['name'].unique().tolist()

# Look up the closest article name for every script character, dropping the unmatched ones
candidate_matches = (
    (script_name, find_best_match(script_name, all_names)) for script_name in script_names
)
matched_names = {
    script_name: article_name
    for script_name, article_name in candidate_matches
    if article_name
}

# Reverse lookup: article name -> script name.
# When several script names map to the same article, the last one wins.
script_name_by_article = {
    article_name: script_name for script_name, article_name in matched_names.items()
}

# Restrict the frame to matched articles and record which script name each one matched
matched_df = (
    filtered_df[filtered_df['name'].isin(matched_names.values())]
    .reset_index(drop=True)
    .assign(matched_name=lambda frame: frame['name'].map(script_name_by_article))
)
matched_df

Unnamed: 0,name,manufacturer,type,cost,purpose,species,affiliation,gender,pronouns,hair,...,form,teams,depth,candidates,electorate,stars,arms,satellites,incubationperiod,matched_name
0,2r-series medical droid,Cybot Galactica,,,,,,,,,...,,,,,,,,,,medical droid
1,aat driver battle droid,Baktoid Combat Automata,,,,,Confederacy of Independent Systems,Masculine programming,,,...,,,,,,,,,,battle droid
2,b2-ha super battle droid,Baktoid Combat Automata,,,,,Confederacy of Independent Systems,Masculine programming,,,...,,,,,,,,,,super battle droid 2
3,c-3po,Cybot Galactica,,,,,Skywalker family,Masculine programming,He/him,,...,,,,,,,,,,c-3po
4,c5 bartender droid,LeisureMech Enterprises,,,,,Alisandre Hotel,,,,...,,,,,,,,,,bartender
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,salbee,,,,,,Sarkin Enneb's criminal group,Female,She/her,,...,,,,,,,,,,sabe
161,unidentified sith emperor,,,,,,Sith Empire,Male,,,...,,,,,,,,,,emperor
162,bobbie,,,,,,,Feminine programming,,,...,,,,,,,,,,hobbie
163,gungan,,,,,,,,,,...,,,,,,,,,,gungan lookout


In [18]:
# Drop sparse columns: a column survives only if at least 10% of its rows are non-null
threshold = 0.1
filled_share = matched_df.notna().mean()  # fraction of non-null values per column
sufficiently_filled = filled_share >= threshold
matched_df = matched_df.loc[:, sufficiently_filled]
matched_df

Unnamed: 0,name,species,affiliation,gender,pronouns,hair,skin,homeworld,death,height,eyes,class,mass,apprentices,birth,families,parents,children,matched_name
0,2r-series medical droid,,,,,,,,,,,Medical droid,,,,,,,medical droid
1,aat driver battle droid,,Confederacy of Independent Systems,Masculine programming,,,,,,"1.93 meters (6 ft, 4 in)",,Battle droid,,,,,,,battle droid
2,b2-ha super battle droid,,Confederacy of Independent Systems,Masculine programming,,,,,,"1.93 meters (6 ft, 4 in)",,Battle droid,,,,,,,super battle droid 2
3,c-3po,,Skywalker family,Masculine programming,He/him,,,Tatooine,"0 ABY,","1.77 meters (5 ft, 10 in)",,Protocol droid,75 kilograms,,Between 34 BBY and 32 BBY,,,,c-3po
4,c5 bartender droid,,Alisandre Hotel,,,,,,,,,Bartender droid,,,,,,,bartender
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,salbee,,Sarkin Enneb's criminal group,Female,She/her,,,,,Taller than 1.58 meters,,,,,c. 31 BBY,,,,sabe
161,unidentified sith emperor,,Sith Empire,Male,,,,,,,,,,,,,,,emperor
162,bobbie,,,Feminine programming,,,,,,1.7 meters,,,,,,,,,hobbie
163,gungan,,,,,,,,,2 meters,,Amphibian,75 kg (165lbs),,,,,,gungan lookout


In [19]:
# Export the matched characters as a semicolon-separated CSV, without the row index
matched_df.to_csv(
    "../data/webscrape_script_char.csv",
    sep=';',
    index=False,
)