# Wikipedia scraping and KMeans clustering
The example below is taken and adapted from a workshop example from the University of Exeter.

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import wikipedia

In [23]:
# Obtain formatted wiki pages for Exeter and Pizza
p_wiki = wikipedia.page("University_of_Exeter")
page_1 = p_wiki.content.replace("\n", "").split(sep='.')

p_wiki = wikipedia.page("Pizza")
page_2 = p_wiki.content.replace("\n", "").split(sep='.')

# Create a single corpus and define each word as separate features
documents = page_1 + page_2
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# Create clusters with KMeans
true_k = 2
model = KMeans(n_clusters=true_k, max_iter=100)
model.fit(X)

# Display features for each cluster
print("Top teams per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d" % i),
    print(terms)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

# Predict labels for samples
Y = vectorizer.transform(["I live in Devon."])
prediction = model.predict(Y)
print(prediction)

Y = vectorizer.transform(["You cook it in the oven"])
prediction = model.predict(Y)
print(prediction)

Top teams per cluster:
Cluster 0
['000', '007', '040', '0761149446', '08', '10', '100', '101', '106', '10th', '110', '11th', '120', '128', '12th', '13', '13th', '14', '150', '156th', '16', '17', '17th', '18', '1830', '1838', '1840', '184th', '1851', '1853', '1854', '1855', '1860s', '1863', '1868', '1888', '1889', '1893', '18th', '19', '1900', '1905', '1912', '1920', '1922', '1926', '1930', '1931', '1933', '1939', '1955', '1957', '1959', '1960s', '1962', '1963', '1965', '1966', '1968', '1970s', '1972', '1974', '1975', '1976', '1978', '1980', '1980s', '1981', '1983', '1984', '1985', '1986', '1987', '1990', '1990s', '1991', '1993', '1997', '1998', '1999', '19th', '20', '200', '2000', '2000s', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '208', '20th', '21st', '22', '220', '225876066', '23', '235', '2368', '24', '261', '26th', '27', '3000', '30th', '31', '333', '34th', '36', '37', '38