# Brands Importer
## Import Wikipedia's list of car brands into a file

### 1. Create BeautifulSoup

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/w/index.php?title=List_of_car_brands'

# Fetch the page. Leaving `response` as the cell's last expression makes the
# notebook display the HTTP status, so a successful download is easy to check.
# The timeout keeps the cell from hanging forever if the server never answers
# (requests waits indefinitely by default).
response = requests.get(url, timeout=30)
response

<Response [200]>

In [2]:
# Make the soup from the page that `response` already holds; calling
# requests.get(url) again here would download the same page a second time.
html = response.content
soup = BeautifulSoup(html, "lxml")

In [3]:
# Collect the text content of every <li> element found in the soup
list_items = soup.find_all('li')
text = [item.get_text() for item in list_items]

# Number of unwanted <li> entries that precede / follow the companies
discard_first_lines = 130
discard_last_lines = 250

# Keep only the entries between those two margins
text = text[discard_first_lines:-discard_last_lines]

# Preview the first 20 remaining entries
text[:20]

['Adelmo',
 'ASA (1985–present)',
 'de Carlo',
 'Feresa',
 'Koller',
 'Oliva',
 'Winograd',
 'Anasagasti (1911–1915)',
 'Andino (1967–1973)',
 'Hispano-Argentina (1925–1953)',
 'Eniak (1983–1989)',
 'Industrias Aeronáuticas y Mecánicas del Estado (IAME, Mechanical Aircraft Industries of the State, 1951–1979), not to be confused with Italian American Motor Engineering',
 'Industrias Eduardo Sal-Lari\xa0[es] (IES, 1983–1990)',
 'Industrias Kaiser Argentina (IKA, 1956–1975) United Kingdom',
 'Birchfield (2003–2004)',
 'Bolwell (1963–present)',
 'Bullet (1996–present)',
 'Carbontech[1] (1999–present)',
 'Classic Glass',
 'Daytona (2002–present)']

### 2. Clean text

In [4]:
# Load the scraped entries into a single-column dataframe named 'name'
brands = pd.DataFrame({'name': text})

# Preview the first rows
brands.head()

Unnamed: 0,name
0,Adelmo
1,ASA (1985–present)
2,de Carlo
3,Feresa
4,Koller


In [5]:
# Regex clean-up rules, applied one after another in this order; every match
# is replaced by the empty string.
cleanup_patterns = [
    # Everything in and after parentheses. The separator before '(' must be
    # optional ('[ +]?', as in the bracket rule below); with the '?' inside
    # the character class a name such as 'Foo(1999)' was left untouched.
    r'[ +]?\(.*',
    # Everything in and after brackets
    r'[ +]?\[.*',
    # Annotations after commas
    r',.*',
    # Alt names (anything following a line break)
    r'\n.*',
    # Non-breaking spaces. Written as an explicit escape so the pattern cannot
    # be mistaken for, or silently turned into, an ordinary space, which
    # would delete every space inside multi-word names like 'de Carlo'.
    '\xa0',
]

for pattern in cleanup_patterns:
    brands['name'] = brands['name'].replace({pattern: ''}, regex=True)

brands

Unnamed: 0,name
0,Adelmo
1,ASA
2,de Carlo
3,Feresa
4,Koller
5,Oliva
6,Winograd
7,Anasagasti
8,Andino
9,Hispano-Argentina


In [6]:
#Add not listed brands
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat adds the same rows (in the same order) and also works on older
# pandas versions. ignore_index=True renumbers the rows, as append did here.
extra_brands = pd.DataFrame({'name': ['Kawasaki', 'Mustang', 'Hummer']})
brands = pd.concat([brands, extra_brands], ignore_index=True)

### 3. Export file

In [7]:
# Situational normalisation, done in one chain:
#   1. lowercase every name
#   2. replace each whitespace with a dash
brands['name'] = (
    brands['name']
    .str.lower()
    .replace({r' ': '-'}, regex=True)
)

In [8]:
# Write the dataframe (with its index column) to a CSV file under data/
output_path = r'data/brands.csv'
brands.to_csv(output_path)