# Project 2

### Prompt:
 Identify a large 2-node network dataset—you can start with a dataset in a repository.  Your data should meet the criteria that it consists of ties between and not within two (or more) distinct groups.
    Reduce the size of the network using a method such as the island method described in chapter 4 of social network analysis.
    What can you infer about each of the distinct groups?

You may work in a small group on the project.

Your code and analysis should be delivered in an IPython Notebook by end of day Sunday. 

### Data

https://snap.stanford.edu/data/wiki-Elec.html

### Libraries

In [79]:
import pandas as pd
import os
import io
import networkx as nx
import re

importedScipy = False
try:    
    import scipy
    importedScipy = True
except:
    pass
try:    
    import nltk
except:
    pass

In [80]:
print ("Pandas Version {}".format(pd.__version__))
print ("Newtworkx Version {}".format(nx.__version__))
if (importedScipy):
    print ("scipy Version {}".format(scipy.__version__))
else:
    print("scipy wasn't imported.")
print("NLTK Version {}".format(nltk.__version__))

Pandas Version 1.3.5
Newtworkx Version 2.6.3
scipy wasn't imported.
NLTK Version 3.7


### Read in Data

In [81]:
#The filename may have to be renamed with the .txt extention.  That has to be done manually.
#This was only the case on one of my machines for some reason??
filename = "wikiElec.ElecBs3.txt"

In [82]:
#This method works for a portion of the file but not the entire file, there's some encoding issue.

#file = io.open(filename, mode="r", encoding="utf-8")
#print(file.read(5555))

In [83]:
with open(filename, 'rb') as f:
  data = f.read()

In [84]:
data = str(data)

In [85]:
#data[:500]

In [86]:
#start of data. before character 392 is an explanation
str(data).find("E\\t1")

392

In [87]:
dataDocs = data[:391]

In [88]:
#dataDocs can be called to see the data explanation that came from the file
dataDocs

'b"# Wikipedia elections (http://cs.stanford.edu/people/jure/pubs/triads-chi10.pdf). Data format:\\r\\n#   E: is election succesful (1) or not (0)\\r\\n#   T: time election was closed\\r\\n#   U: user id (and username) of editor that is being considered for promotion\\r\\n#   N: user id (and username) of the nominator\\r\\n#   V: <vote(1:support, 0:neutral, -1:oppose)> <user_id> <time> <username>\\r\\'

In [89]:
#remove the explanation from the data
data = data[392:]

#### make a class to store the data

In [90]:
#TODO, might make work easier later.  For now, making seprate lists to combine into DF
class election:
    def __init__(self,outcome):
        if( (type(outcome)==int) & (outcome >=0) & (outcome <=1)):
            self.outcome=outcome
        else:
            print('invalid election outcome, {}'.format(str(outcome)))
            self.outcome= "-"

### Parse Data

In [91]:
electionsText = []

In [92]:
len(data)

5314884

In [93]:
#this loop maekes each election a seprate element of a list by finding each "E"
# "E" means new elections (see dataDocs a few cells above)
election = ""
for i in range(len(data)):
    if (data[i] == "E"):
        if(data[i+1]==("\\")):
            if(data[i+2]=="t"):
                electionsText.append(election)
                election=str(data[i])
    else:
        election = election +data[i]

In [94]:
len(electionsText)

2794

In [95]:
#remove first blank entry due to loop
electionsText = electionsText[1:]

In [96]:
len(electionsText)
#there were 2793 elections

2793

In [97]:
#Each element of this list is a mess and the data needs to be parsed out.
print(electionsText[0])

E\t1\r\nT\t2004-09-21 01:15:53\r\nU\t30\tcjcurrie\r\nN\t32\tandyl\r\nV\t1\t3\t2004-09-14 16:26:00\tludraman\r\nV\t-1\t25\t2004-09-14 16:53:00\tblankfaze\r\nV\t1\t4\t2004-09-14 17:08:00\tgzornenplatz\r\nV\t1\t5\t2004-09-14 17:37:00\torthogonal\r\nV\t1\t6\t2004-09-14 19:28:00\tandrevan\r\nV\t1\t7\t2004-09-14 19:37:00\ttexture\r\nV\t1\t8\t2004-09-14 21:04:00\tlst27\r\nV\t1\t9\t2004-09-14 21:30:00\tmirv\r\nV\t1\t10\t2004-09-14 22:13:00\tan\xc3\xa1rion\r\nV\t0\t26\t2004-09-14 22:18:00\tgrunt\r\nV\t0\t27\t2004-09-15 03:19:00\tslowking\r\nV\t0\t28\t2004-09-15 03:20:00\tneutrality\r\nV\t1\t11\t2004-09-15 04:28:00\tmerovingian\r\nV\t1\t12\t2004-09-15 06:56:00\twile\r\nV\t1\t13\t2004-09-15 09:19:00\tsjc\r\nV\t1\t14\t2004-09-15 12:20:00\t172\r\nV\t0\t29\t2004-09-16 00:58:00\tugen64\r\nV\t1\t15\t2004-09-16 14:50:00\tdanny\r\nV\t1\t16\t2004-09-16 15:31:00\tsimonp\r\nV\t1\t17\t2004-09-17 13:49:00\tjwrosenzweig\r\nV\t1\t18\t2004-09-17 20:57:00\tadam\r\nV\t1\t19\t2004-09-17 22:11:00\tffirehorse\r\nV\t

In [98]:
#The third char is the outcome result.  
#So this list is a list of all the election outcomes.  Election outcomes might eventually be
#a node attribute. 
electionOutcomes =[]
for i in electionsText:
    electionOutcomes.append(i[3])

In [99]:
len(electionOutcomes)
#same length as data texts

2793

In [100]:
electionOutcomes[:15]
#works

['1', '1', '1', '1', '1', '0', '1', '0', '0', '0', '1', '0', '1', '0', '1']

In [101]:
electionsText[3].find("U")

beingVotedOnRaw[1233]
#The above loop worked to get raw text like this one.

'U\\t465\\tmoink\\r\\n'

In [102]:
#beingVotedOnRaw is the raw text of who was voted on in the elction.  There's also a user ID 
#number in the text.  Either the name or the text could be the node "name" / identifier
#for a given election. 
beingVotedOnRaw =[]
for i in electionsText:
    beingVotedOnRaw.append(i[i.find("U"):i.find("N")])
#This find the text between U and N which, from the Docs, is the username and ID num up
#for election.

In [103]:
beingVotedOn = [] #Node names, ie. "who's up for election"

#re.split("\d+",beingVotedOnRaw[1233]) #This line splits the raw text to get what 
#comes after the number, so you can just find the name

In [104]:
#The following code is one parsing job of the user id,
#which happens in a loop in the next cell.  You can uncomment it for testing.

#match2 = re.search("[a-z]+",re.split("\d+",beingVotedOnRaw[1233])[1])
#match2.span()
#re.split("\d+",beingVotedOnRaw[0])[1])[match2.span()[1]]
#re.split("\d+",beingVotedOnRaw[1233])[1][match2.span()[0]:match2.span()[1]][1:]

In [105]:
for i in range(len(electionsText)):
    match2 = re.search("[a-z]+",re.split("\d+",beingVotedOnRaw[i])[1])
    beingVotedOn.\
    append(
        re.split("\d+",beingVotedOnRaw[i])[1][match2.span()[0]:match2.span()[1]][1:]      
        )

In [106]:
for i in range(8):
    print(beingVotedOn[i] + "  from this raw text:\t" +  electionsText[i][35:60])
#works

cjcurrie  from this raw text:	\t30\tcjcurrie\r\nN\t32\t
zoney  from this raw text:	\t54\tzoney\r\nN\t28\tneu
gerald  from this raw text:	\t61\tgerald_farinas\r\nN
andrevan  from this raw text:	\t6\tandrevan\r\nN\t70\tn
arminius  from this raw text:	\t38\tarminius\r\nN\t55\t
lst  from this raw text:	\t8\tlst27\r\nN\t-1\tUNKN
chmod  from this raw text:	\t33\tchmod007\r\nN\t-1\t
taoster  from this raw text:	\t93\ttaoster\r\nN\t-1\tU


## Disregard all code below here!

## TODO: Get the votes from the raw text, who voted for who.  So it'll have to be some sort of dictionary or custom class --> "A voted for B", "C voted for B", "D voted for B", etc.

In [107]:
#this gets the user associated with the election
#electionsText[3][electionsText[3].find("U"):electionsText[3].find("N")]

In [108]:
#intInQuestion= 3
#userRaw = re.search("U",electionsText[intInQuestion])
#userRaw2 = re.search("N",electionsText[intInQuestion])#.find("U"))#:electionsText[intInQuestion].find("N"))
#print(electionsText[intInQuestion][userRaw.span()[0]:userRaw2.span()[1]])


In [109]:
#userIDMatchObject = re.search("\d+",electionsText[intInQuestion]\
#                              [electionsText[intInQuestion].find("U"):electionsText[intInQuestion].find("N")])
#print(userIDMatchObject)
#UserID = electionsText[userIDMatchObject.span()[0]:userIDMatchObject.span()[1]]
#UserID

In [110]:
#dir(userID)
#userID.span()

In [111]:
#userIDExtracted  = ""
#for i in range(userID.span()[0],userID.span()[1]):
#    userIDExtracted = userIDExtracted + userID[i]


In [112]:
#userID[3]